diff mbox series

[v7,10/13] vfio: Add load state functions to SaveVMHandlers

Message ID 1562665760-26158-11-git-send-email-kwankhede@nvidia.com
State New
Headers show
Series Add migration support for VFIO device | expand

Commit Message

Kirti Wankhede July 9, 2019, 9:49 a.m. UTC
Flow during _RESUMING device state:
- If the vendor driver defines a mappable region, mmap the migration region.
- Load config state.
- For each data packet, until VFIO_MIG_FLAG_END_OF_STATE is reached:
    - read data_size from the packet, read a buffer of data_size
    - read data_offset, the offset at which QEMU should write the data.
        If the region is mmapped, write data_size bytes of data to the
        mmapped region.
    - write data_size.
        In the case of an mmapped region, the write to data_size indicates
        to the kernel driver that data has been written to the staging buffer.
    - if the region is trapped, pwrite() data_size bytes of data at
      data_offset.
- Repeat the above until VFIO_MIG_FLAG_END_OF_STATE.
- Unmap the migration region.

For the user, the data is opaque. The user should write the data in the
same order as it was received.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 hw/vfio/migration.c  | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events |   3 +
 2 files changed, 165 insertions(+)

Comments

Yan Zhao July 12, 2019, 2:52 a.m. UTC | #1
On Tue, Jul 09, 2019 at 05:49:17PM +0800, Kirti Wankhede wrote:
> Flow during _RESUMING device state:
> - If Vendor driver defines mappable region, mmap migration region.
> - Load config state.
> - For data packet, till VFIO_MIG_FLAG_END_OF_STATE is not reached
>     - read data_size from packet, read buffer of data_size
>     - read data_offset from where QEMU should write data.
>         if region is mmaped, write data of data_size to mmaped region.
>     - write data_size.
>         In case of mmapped region, write to data_size indicates kernel
>         driver that data is written in staging buffer.
>     - if region is trapped, pwrite() data of data_size from data_offset.
> - Repeat above until VFIO_MIG_FLAG_END_OF_STATE.
> - Unmap migration region.
> 
> For user, data is opaque. User should write data in the same order as
> received.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  hw/vfio/migration.c  | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events |   3 +
>  2 files changed, 165 insertions(+)
> 
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 4e9b4cce230b..5fb4c5329ede 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -249,6 +249,26 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
>      return qemu_file_get_error(f);
>  }
>  
> +static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    uint64_t data;
> +
> +    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
> +        vbasedev->ops->vfio_load_config(vbasedev, f);
> +    }
> +
> +    data = qemu_get_be64(f);
> +    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
> +        error_report("%s: Failed loading device config space, "
> +                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
> +        return -EINVAL;
> +    }
> +
> +    trace_vfio_load_device_config_state(vbasedev->name);
> +    return qemu_file_get_error(f);
> +}
> +
>  /* ---------------------------------------------------------------------- */
>  
>  static int vfio_save_setup(QEMUFile *f, void *opaque)
> @@ -421,12 +441,154 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>      return ret;
>  }
>  
> +static int vfio_load_setup(QEMUFile *f, void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    VFIOMigration *migration = vbasedev->migration;
> +    int ret = 0;
> +
> +    if (migration->region.buffer.mmaps) {
> +        ret = vfio_region_mmap(&migration->region.buffer);
> +        if (ret) {
> +            error_report("%s: Failed to mmap VFIO migration region %d: %s",
> +                         vbasedev->name, migration->region.index,
> +                         strerror(-ret));
> +            return ret;
> +        }
> +    }
> +
> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING);
> +    if (ret) {
> +        error_report("%s: Failed to set state RESUMING", vbasedev->name);
> +    }
> +    return ret;
> +}
> +
> +static int vfio_load_cleanup(void *opaque)
> +{
> +    vfio_save_cleanup(opaque);
> +    return 0;
> +}
> +
> +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    VFIOMigration *migration = vbasedev->migration;
> +    int ret = 0;
> +    uint64_t data, data_size;
> +
I think checking of version_id is still needed.

Thanks
Yan

> +    data = qemu_get_be64(f);
> +    while (data != VFIO_MIG_FLAG_END_OF_STATE) {
> +
> +        trace_vfio_load_state(vbasedev->name, data);
> +
> +        switch (data) {
> +        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
> +        {
> +            ret = vfio_load_device_config_state(f, opaque);
> +            if (ret) {
> +                return ret;
> +            }
> +            break;
> +        }
> +        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
> +        {
> +            data = qemu_get_be64(f);
> +            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
> +                return ret;
> +            } else {
> +                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
> +                             vbasedev->name, data);
> +                return -EINVAL;
> +            }
> +            break;
> +        }
> +        case VFIO_MIG_FLAG_DEV_DATA_STATE:
> +        {
> +            VFIORegion *region = &migration->region.buffer;
> +            void *buf = NULL;
> +            bool buffer_mmaped = false;
> +            uint64_t data_offset = 0;
> +
> +            data_size = qemu_get_be64(f);
> +            if (data_size == 0) {
> +                break;
> +            }
> +
> +            ret = pread(vbasedev->fd, &data_offset, sizeof(data_offset),
> +                        region->fd_offset +
> +                        offsetof(struct vfio_device_migration_info,
> +                        data_offset));
> +            if (ret != sizeof(data_offset)) {
> +                error_report("%s:Failed to get migration buffer data offset %d",
> +                             vbasedev->name, ret);
> +                return -EINVAL;
> +            }
> +
> +            if (region->mmaps) {
> +                buf = find_data_region(region, data_offset, data_size);
> +            }
> +
> +            buffer_mmaped = (buf != NULL) ? true : false;
> +
> +            if (!buffer_mmaped) {
> +                buf = g_try_malloc0(data_size);
> +                if (!buf) {
> +                    error_report("%s: Error allocating buffer ", __func__);
> +                    return -ENOMEM;
> +                }
> +            }
> +
> +            qemu_get_buffer(f, buf, data_size);
> +
> +            ret = pwrite(vbasedev->fd, &data_size, sizeof(data_size),
> +                         region->fd_offset +
> +                       offsetof(struct vfio_device_migration_info, data_size));
> +            if (ret != sizeof(data_size)) {
> +                error_report("%s: Failed to set migration buffer data size %d",
> +                             vbasedev->name, ret);
> +                if (!buffer_mmaped) {
> +                    g_free(buf);
> +                }
> +                return -EINVAL;
> +            }
> +
> +            if (!buffer_mmaped) {
> +                ret = pwrite(vbasedev->fd, buf, data_size,
> +                             region->fd_offset + data_offset);
> +                g_free(buf);
> +
> +                if (ret != data_size) {
> +                    error_report("%s: Failed to set migration buffer %d",
> +                                 vbasedev->name, ret);
> +                    return -EINVAL;
> +                }
> +            }
> +            trace_vfio_load_state_device_data(vbasedev->name, data_offset,
> +                                              data_size);
> +            break;
> +        }
> +        }
> +
> +        ret = qemu_file_get_error(f);
> +        if (ret) {
> +            return ret;
> +        }
> +        data = qemu_get_be64(f);
> +    }
> +
> +    return ret;
> +}
> +
>  static SaveVMHandlers savevm_vfio_handlers = {
>      .save_setup = vfio_save_setup,
>      .save_cleanup = vfio_save_cleanup,
>      .save_live_pending = vfio_save_pending,
>      .save_live_iterate = vfio_save_iterate,
>      .save_live_complete_precopy = vfio_save_complete_precopy,
> +    .load_setup = vfio_load_setup,
> +    .load_cleanup = vfio_load_cleanup,
> +    .load_state = vfio_load_state,
>  };
>  
>  /* ---------------------------------------------------------------------- */
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index bdf40ba368c7..ac065b559f4e 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -157,3 +157,6 @@ vfio_save_device_config_state(char *name) " (%s)"
>  vfio_save_pending(char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
>  vfio_save_iterate(char *name, int data_size) " (%s) data_size %d"
>  vfio_save_complete_precopy(char *name) " (%s)"
> +vfio_load_device_config_state(char *name) " (%s)"
> +vfio_load_state(char *name, uint64_t data) " (%s) data 0x%"PRIx64
> +vfio_load_state_device_data(char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
> -- 
> 2.7.0
>
Kirti Wankhede July 18, 2019, 7 p.m. UTC | #2
On 7/12/2019 8:22 AM, Yan Zhao wrote:
> On Tue, Jul 09, 2019 at 05:49:17PM +0800, Kirti Wankhede wrote:
>> Flow during _RESUMING device state:
>> - If Vendor driver defines mappable region, mmap migration region.
>> - Load config state.
>> - For data packet, till VFIO_MIG_FLAG_END_OF_STATE is not reached
>>     - read data_size from packet, read buffer of data_size
>>     - read data_offset from where QEMU should write data.
>>         if region is mmaped, write data of data_size to mmaped region.
>>     - write data_size.
>>         In case of mmapped region, write to data_size indicates kernel
>>         driver that data is written in staging buffer.
>>     - if region is trapped, pwrite() data of data_size from data_offset.
>> - Repeat above until VFIO_MIG_FLAG_END_OF_STATE.
>> - Unmap migration region.
>>
>> For user, data is opaque. User should write data in the same order as
>> received.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>  hw/vfio/migration.c  | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/vfio/trace-events |   3 +
>>  2 files changed, 165 insertions(+)
>>
>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>> index 4e9b4cce230b..5fb4c5329ede 100644
>> --- a/hw/vfio/migration.c
>> +++ b/hw/vfio/migration.c
>> @@ -249,6 +249,26 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
>>      return qemu_file_get_error(f);
>>  }
>>  
>> +static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    uint64_t data;
>> +
>> +    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
>> +        vbasedev->ops->vfio_load_config(vbasedev, f);
>> +    }
>> +
>> +    data = qemu_get_be64(f);
>> +    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
>> +        error_report("%s: Failed loading device config space, "
>> +                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
>> +        return -EINVAL;
>> +    }
>> +
>> +    trace_vfio_load_device_config_state(vbasedev->name);
>> +    return qemu_file_get_error(f);
>> +}
>> +
>>  /* ---------------------------------------------------------------------- */
>>  
>>  static int vfio_save_setup(QEMUFile *f, void *opaque)
>> @@ -421,12 +441,154 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>      return ret;
>>  }
>>  
>> +static int vfio_load_setup(QEMUFile *f, void *opaque)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    VFIOMigration *migration = vbasedev->migration;
>> +    int ret = 0;
>> +
>> +    if (migration->region.buffer.mmaps) {
>> +        ret = vfio_region_mmap(&migration->region.buffer);
>> +        if (ret) {
>> +            error_report("%s: Failed to mmap VFIO migration region %d: %s",
>> +                         vbasedev->name, migration->region.index,
>> +                         strerror(-ret));
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING);
>> +    if (ret) {
>> +        error_report("%s: Failed to set state RESUMING", vbasedev->name);
>> +    }
>> +    return ret;
>> +}
>> +
>> +static int vfio_load_cleanup(void *opaque)
>> +{
>> +    vfio_save_cleanup(opaque);
>> +    return 0;
>> +}
>> +
>> +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    VFIOMigration *migration = vbasedev->migration;
>> +    int ret = 0;
>> +    uint64_t data, data_size;
>> +
> I think checking of version_id is still needed.
> 

Checking version_id with what value?

Thanks,
Kirti
Yan Zhao July 22, 2019, 3:20 a.m. UTC | #3
On Fri, Jul 19, 2019 at 03:00:13AM +0800, Kirti Wankhede wrote:
> 
> 
> On 7/12/2019 8:22 AM, Yan Zhao wrote:
> > On Tue, Jul 09, 2019 at 05:49:17PM +0800, Kirti Wankhede wrote:
> >> Flow during _RESUMING device state:
> >> - If Vendor driver defines mappable region, mmap migration region.
> >> - Load config state.
> >> - For data packet, till VFIO_MIG_FLAG_END_OF_STATE is not reached
> >>     - read data_size from packet, read buffer of data_size
> >>     - read data_offset from where QEMU should write data.
> >>         if region is mmaped, write data of data_size to mmaped region.
> >>     - write data_size.
> >>         In case of mmapped region, write to data_size indicates kernel
> >>         driver that data is written in staging buffer.
> >>     - if region is trapped, pwrite() data of data_size from data_offset.
> >> - Repeat above until VFIO_MIG_FLAG_END_OF_STATE.
> >> - Unmap migration region.
> >>
> >> For user, data is opaque. User should write data in the same order as
> >> received.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>  hw/vfio/migration.c  | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  hw/vfio/trace-events |   3 +
> >>  2 files changed, 165 insertions(+)
> >>
> >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> >> index 4e9b4cce230b..5fb4c5329ede 100644
> >> --- a/hw/vfio/migration.c
> >> +++ b/hw/vfio/migration.c
> >> @@ -249,6 +249,26 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
> >>      return qemu_file_get_error(f);
> >>  }
> >>  
> >> +static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
> >> +{
> >> +    VFIODevice *vbasedev = opaque;
> >> +    uint64_t data;
> >> +
> >> +    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
> >> +        vbasedev->ops->vfio_load_config(vbasedev, f);
> >> +    }
> >> +
> >> +    data = qemu_get_be64(f);
> >> +    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
> >> +        error_report("%s: Failed loading device config space, "
> >> +                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
> >> +        return -EINVAL;
> >> +    }
> >> +
> >> +    trace_vfio_load_device_config_state(vbasedev->name);
> >> +    return qemu_file_get_error(f);
> >> +}
> >> +
> >>  /* ---------------------------------------------------------------------- */
> >>  
> >>  static int vfio_save_setup(QEMUFile *f, void *opaque)
> >> @@ -421,12 +441,154 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> >>      return ret;
> >>  }
> >>  
> >> +static int vfio_load_setup(QEMUFile *f, void *opaque)
> >> +{
> >> +    VFIODevice *vbasedev = opaque;
> >> +    VFIOMigration *migration = vbasedev->migration;
> >> +    int ret = 0;
> >> +
> >> +    if (migration->region.buffer.mmaps) {
> >> +        ret = vfio_region_mmap(&migration->region.buffer);
> >> +        if (ret) {
> >> +            error_report("%s: Failed to mmap VFIO migration region %d: %s",
> >> +                         vbasedev->name, migration->region.index,
> >> +                         strerror(-ret));
> >> +            return ret;
> >> +        }
> >> +    }
> >> +
> >> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING);
> >> +    if (ret) {
> >> +        error_report("%s: Failed to set state RESUMING", vbasedev->name);
> >> +    }
> >> +    return ret;
> >> +}
> >> +
> >> +static int vfio_load_cleanup(void *opaque)
> >> +{
> >> +    vfio_save_cleanup(opaque);
> >> +    return 0;
> >> +}
> >> +
> >> +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
> >> +{
> >> +    VFIODevice *vbasedev = opaque;
> >> +    VFIOMigration *migration = vbasedev->migration;
> >> +    int ret = 0;
> >> +    uint64_t data, data_size;
> >> +
> > I think checking of version_id is still needed.
> > 
> 
> Checking version_id with what value?
>
The version_id passed in here is the source's VFIO software interface ID.
It needs to be checked against the value on the target side, right?

Though we previously discussed the sysfs node interface to check live
migration version even before launching live migration, I think we still
need this runtime software version check in qemu to ensure software
interfaces in QEMU VFIO are compatible.

Thanks
Yan
Alex Williamson July 22, 2019, 7:07 p.m. UTC | #4
On Sun, 21 Jul 2019 23:20:28 -0400
Yan Zhao <yan.y.zhao@intel.com> wrote:

> On Fri, Jul 19, 2019 at 03:00:13AM +0800, Kirti Wankhede wrote:
> > 
> > 
> > On 7/12/2019 8:22 AM, Yan Zhao wrote:  
> > > On Tue, Jul 09, 2019 at 05:49:17PM +0800, Kirti Wankhede wrote:  
> > >> Flow during _RESUMING device state:
> > >> - If Vendor driver defines mappable region, mmap migration region.
> > >> - Load config state.
> > >> - For data packet, till VFIO_MIG_FLAG_END_OF_STATE is not reached
> > >>     - read data_size from packet, read buffer of data_size
> > >>     - read data_offset from where QEMU should write data.
> > >>         if region is mmaped, write data of data_size to mmaped region.
> > >>     - write data_size.
> > >>         In case of mmapped region, write to data_size indicates kernel
> > >>         driver that data is written in staging buffer.
> > >>     - if region is trapped, pwrite() data of data_size from data_offset.
> > >> - Repeat above until VFIO_MIG_FLAG_END_OF_STATE.
> > >> - Unmap migration region.
> > >>
> > >> For user, data is opaque. User should write data in the same order as
> > >> received.
> > >>
> > >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >> ---
> > >>  hw/vfio/migration.c  | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > >>  hw/vfio/trace-events |   3 +
> > >>  2 files changed, 165 insertions(+)
> > >>
> > >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> > >> index 4e9b4cce230b..5fb4c5329ede 100644
> > >> --- a/hw/vfio/migration.c
> > >> +++ b/hw/vfio/migration.c
> > >> @@ -249,6 +249,26 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
> > >>      return qemu_file_get_error(f);
> > >>  }
> > >>  
> > >> +static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
> > >> +{
> > >> +    VFIODevice *vbasedev = opaque;
> > >> +    uint64_t data;
> > >> +
> > >> +    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
> > >> +        vbasedev->ops->vfio_load_config(vbasedev, f);
> > >> +    }
> > >> +
> > >> +    data = qemu_get_be64(f);
> > >> +    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
> > >> +        error_report("%s: Failed loading device config space, "
> > >> +                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
> > >> +        return -EINVAL;
> > >> +    }
> > >> +
> > >> +    trace_vfio_load_device_config_state(vbasedev->name);
> > >> +    return qemu_file_get_error(f);
> > >> +}
> > >> +
> > >>  /* ---------------------------------------------------------------------- */
> > >>  
> > >>  static int vfio_save_setup(QEMUFile *f, void *opaque)
> > >> @@ -421,12 +441,154 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> > >>      return ret;
> > >>  }
> > >>  
> > >> +static int vfio_load_setup(QEMUFile *f, void *opaque)
> > >> +{
> > >> +    VFIODevice *vbasedev = opaque;
> > >> +    VFIOMigration *migration = vbasedev->migration;
> > >> +    int ret = 0;
> > >> +
> > >> +    if (migration->region.buffer.mmaps) {
> > >> +        ret = vfio_region_mmap(&migration->region.buffer);
> > >> +        if (ret) {
> > >> +            error_report("%s: Failed to mmap VFIO migration region %d: %s",
> > >> +                         vbasedev->name, migration->region.index,
> > >> +                         strerror(-ret));
> > >> +            return ret;
> > >> +        }
> > >> +    }
> > >> +
> > >> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING);
> > >> +    if (ret) {
> > >> +        error_report("%s: Failed to set state RESUMING", vbasedev->name);
> > >> +    }
> > >> +    return ret;
> > >> +}
> > >> +
> > >> +static int vfio_load_cleanup(void *opaque)
> > >> +{
> > >> +    vfio_save_cleanup(opaque);
> > >> +    return 0;
> > >> +}
> > >> +
> > >> +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
> > >> +{
> > >> +    VFIODevice *vbasedev = opaque;
> > >> +    VFIOMigration *migration = vbasedev->migration;
> > >> +    int ret = 0;
> > >> +    uint64_t data, data_size;
> > >> +  
> > > I think checking of version_id is still needed.
> > >   
> > 
> > Checking version_id with what value?
> >  
> this version_id passed-in is the source VFIO software interface id.
> need to check it with the value in target side, right?
> 
> Though we previously discussed the sysfs node interface to check live
> migration version even before launching live migration, I think we still
> need this runtime software version check in qemu to ensure software
> interfaces in QEMU VFIO are compatible.

Do we want QEMU to interact directly with sysfs for that, which would
require write privileges to sysfs, or do we want to suggest that vendor
drivers should include equivalent information early in their migration
data stream to force a migration failure as early as possible for
incompatible data?  I think we need the latter regardless because the
vendor driver should never trust userspace like that, but does that
make any QEMU use of the sysfs version test itself redundant?  Thanks,

Alex
Yan Zhao July 22, 2019, 9:50 p.m. UTC | #5
On Tue, Jul 23, 2019 at 03:07:13AM +0800, Alex Williamson wrote:
> On Sun, 21 Jul 2019 23:20:28 -0400
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Fri, Jul 19, 2019 at 03:00:13AM +0800, Kirti Wankhede wrote:
> > > 
> > > 
> > > On 7/12/2019 8:22 AM, Yan Zhao wrote:  
> > > > On Tue, Jul 09, 2019 at 05:49:17PM +0800, Kirti Wankhede wrote:  
> > > >> Flow during _RESUMING device state:
> > > >> - If Vendor driver defines mappable region, mmap migration region.
> > > >> - Load config state.
> > > >> - For data packet, till VFIO_MIG_FLAG_END_OF_STATE is not reached
> > > >>     - read data_size from packet, read buffer of data_size
> > > >>     - read data_offset from where QEMU should write data.
> > > >>         if region is mmaped, write data of data_size to mmaped region.
> > > >>     - write data_size.
> > > >>         In case of mmapped region, write to data_size indicates kernel
> > > >>         driver that data is written in staging buffer.
> > > >>     - if region is trapped, pwrite() data of data_size from data_offset.
> > > >> - Repeat above until VFIO_MIG_FLAG_END_OF_STATE.
> > > >> - Unmap migration region.
> > > >>
> > > >> For user, data is opaque. User should write data in the same order as
> > > >> received.
> > > >>
> > > >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > >> ---
> > > >>  hw/vfio/migration.c  | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >>  hw/vfio/trace-events |   3 +
> > > >>  2 files changed, 165 insertions(+)
> > > >>
> > > >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> > > >> index 4e9b4cce230b..5fb4c5329ede 100644
> > > >> --- a/hw/vfio/migration.c
> > > >> +++ b/hw/vfio/migration.c
> > > >> @@ -249,6 +249,26 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
> > > >>      return qemu_file_get_error(f);
> > > >>  }
> > > >>  
> > > >> +static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
> > > >> +{
> > > >> +    VFIODevice *vbasedev = opaque;
> > > >> +    uint64_t data;
> > > >> +
> > > >> +    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
> > > >> +        vbasedev->ops->vfio_load_config(vbasedev, f);
> > > >> +    }
> > > >> +
> > > >> +    data = qemu_get_be64(f);
> > > >> +    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
> > > >> +        error_report("%s: Failed loading device config space, "
> > > >> +                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
> > > >> +        return -EINVAL;
> > > >> +    }
> > > >> +
> > > >> +    trace_vfio_load_device_config_state(vbasedev->name);
> > > >> +    return qemu_file_get_error(f);
> > > >> +}
> > > >> +
> > > >>  /* ---------------------------------------------------------------------- */
> > > >>  
> > > >>  static int vfio_save_setup(QEMUFile *f, void *opaque)
> > > >> @@ -421,12 +441,154 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> > > >>      return ret;
> > > >>  }
> > > >>  
> > > >> +static int vfio_load_setup(QEMUFile *f, void *opaque)
> > > >> +{
> > > >> +    VFIODevice *vbasedev = opaque;
> > > >> +    VFIOMigration *migration = vbasedev->migration;
> > > >> +    int ret = 0;
> > > >> +
> > > >> +    if (migration->region.buffer.mmaps) {
> > > >> +        ret = vfio_region_mmap(&migration->region.buffer);
> > > >> +        if (ret) {
> > > >> +            error_report("%s: Failed to mmap VFIO migration region %d: %s",
> > > >> +                         vbasedev->name, migration->region.index,
> > > >> +                         strerror(-ret));
> > > >> +            return ret;
> > > >> +        }
> > > >> +    }
> > > >> +
> > > >> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING);
> > > >> +    if (ret) {
> > > >> +        error_report("%s: Failed to set state RESUMING", vbasedev->name);
> > > >> +    }
> > > >> +    return ret;
> > > >> +}
> > > >> +
> > > >> +static int vfio_load_cleanup(void *opaque)
> > > >> +{
> > > >> +    vfio_save_cleanup(opaque);
> > > >> +    return 0;
> > > >> +}
> > > >> +
> > > >> +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
> > > >> +{
> > > >> +    VFIODevice *vbasedev = opaque;
> > > >> +    VFIOMigration *migration = vbasedev->migration;
> > > >> +    int ret = 0;
> > > >> +    uint64_t data, data_size;
> > > >> +  
> > > > I think checking of version_id is still needed.
> > > >   
> > > 
> > > Checking version_id with what value?
> > >  
> > this version_id passed-in is the source VFIO software interface id.
> > need to check it with the value in target side, right?
> > 
> > Though we previously discussed the sysfs node interface to check live
> > migration version even before launching live migration, I think we still
> > need this runtime software version check in qemu to ensure software
> > interfaces in QEMU VFIO are compatible.
> 
> Do we want QEMU to interact directly with sysfs for that, which would
> require write privileges to sysfs, or do we want to suggest that vendor
> drivers should include equivalent information early in their migration
> data stream to force a migration failure as early as possible for
> incompatible data?  I think we need the latter regardless because the
> vendor driver should never trust userspace like that, but does that
> make any QEMU use of the sysfs version test itself redundant?  Thanks,
> 
> Alex

hi Alex
I think QEMU needs to check at least the version of the software interface
implemented in QEMU itself, such as the format of the migration region and
the details of the migration protocol; in other words, the version of the
software interface through which QEMU interacts with the vendor driver.
This information would not be known to the vendor driver until migration
has run to a certain phase.
E.g. if the saving flow or format in the source QEMU is changed slightly as
a result of a software upgrade, the target QEMU has to detect that through
this version_id check, as the vendor driver has no knowledge of it.
Does that make sense?


Thanks
Yan
Kirti Wankhede Aug. 20, 2019, 8:35 p.m. UTC | #6
On 7/23/2019 3:20 AM, Yan Zhao wrote:
> On Tue, Jul 23, 2019 at 03:07:13AM +0800, Alex Williamson wrote:
>> On Sun, 21 Jul 2019 23:20:28 -0400
>> Yan Zhao <yan.y.zhao@intel.com> wrote:
>>
>>> On Fri, Jul 19, 2019 at 03:00:13AM +0800, Kirti Wankhede wrote:
>>>>
>>>>
>>>> On 7/12/2019 8:22 AM, Yan Zhao wrote:  
>>>>> On Tue, Jul 09, 2019 at 05:49:17PM +0800, Kirti Wankhede wrote:  
>>>>>> Flow during _RESUMING device state:
>>>>>> - If Vendor driver defines mappable region, mmap migration region.
>>>>>> - Load config state.
>>>>>> - For data packet, till VFIO_MIG_FLAG_END_OF_STATE is not reached
>>>>>>     - read data_size from packet, read buffer of data_size
>>>>>>     - read data_offset from where QEMU should write data.
>>>>>>         if region is mmaped, write data of data_size to mmaped region.
>>>>>>     - write data_size.
>>>>>>         In case of mmapped region, write to data_size indicates kernel
>>>>>>         driver that data is written in staging buffer.
>>>>>>     - if region is trapped, pwrite() data of data_size from data_offset.
>>>>>> - Repeat above until VFIO_MIG_FLAG_END_OF_STATE.
>>>>>> - Unmap migration region.
>>>>>>
>>>>>> For user, data is opaque. User should write data in the same order as
>>>>>> received.
>>>>>>
>>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>>>> ---
>>>>>>  hw/vfio/migration.c  | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  hw/vfio/trace-events |   3 +
>>>>>>  2 files changed, 165 insertions(+)
>>>>>>
>>>>>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>>>>>> index 4e9b4cce230b..5fb4c5329ede 100644
>>>>>> --- a/hw/vfio/migration.c
>>>>>> +++ b/hw/vfio/migration.c
>>>>>> @@ -249,6 +249,26 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
>>>>>>      return qemu_file_get_error(f);
>>>>>>  }
>>>>>>  
>>>>>> +static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
>>>>>> +{
>>>>>> +    VFIODevice *vbasedev = opaque;
>>>>>> +    uint64_t data;
>>>>>> +
>>>>>> +    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
>>>>>> +        vbasedev->ops->vfio_load_config(vbasedev, f);
>>>>>> +    }
>>>>>> +
>>>>>> +    data = qemu_get_be64(f);
>>>>>> +    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
>>>>>> +        error_report("%s: Failed loading device config space, "
>>>>>> +                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
>>>>>> +        return -EINVAL;
>>>>>> +    }
>>>>>> +
>>>>>> +    trace_vfio_load_device_config_state(vbasedev->name);
>>>>>> +    return qemu_file_get_error(f);
>>>>>> +}
>>>>>> +
>>>>>>  /* ---------------------------------------------------------------------- */
>>>>>>  
>>>>>>  static int vfio_save_setup(QEMUFile *f, void *opaque)
>>>>>> @@ -421,12 +441,154 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>>>>>      return ret;
>>>>>>  }
>>>>>>  
>>>>>> +static int vfio_load_setup(QEMUFile *f, void *opaque)
>>>>>> +{
>>>>>> +    VFIODevice *vbasedev = opaque;
>>>>>> +    VFIOMigration *migration = vbasedev->migration;
>>>>>> +    int ret = 0;
>>>>>> +
>>>>>> +    if (migration->region.buffer.mmaps) {
>>>>>> +        ret = vfio_region_mmap(&migration->region.buffer);
>>>>>> +        if (ret) {
>>>>>> +            error_report("%s: Failed to mmap VFIO migration region %d: %s",
>>>>>> +                         vbasedev->name, migration->region.index,
>>>>>> +                         strerror(-ret));
>>>>>> +            return ret;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING);
>>>>>> +    if (ret) {
>>>>>> +        error_report("%s: Failed to set state RESUMING", vbasedev->name);
>>>>>> +    }
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static int vfio_load_cleanup(void *opaque)
>>>>>> +{
>>>>>> +    vfio_save_cleanup(opaque);
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>>>>>> +{
>>>>>> +    VFIODevice *vbasedev = opaque;
>>>>>> +    VFIOMigration *migration = vbasedev->migration;
>>>>>> +    int ret = 0;
>>>>>> +    uint64_t data, data_size;
>>>>>> +  
>>>>> I think checking of version_id is still needed.
>>>>>   
>>>>
>>>> Checking version_id with what value?
>>>>  
>>> this version_id passed-in is the source VFIO software interface id.
>>> need to check it with the value in target side, right?
>>>
>>> Though we previously discussed the sysfs node interface to check live
>>> migration version even before launching live migration, I think we still
>>> need this runtime software version check in qemu to ensure software
>>> interfaces in QEMU VFIO are compatible.
>>
>> Do we want QEMU to interact directly with sysfs for that, which would
>> require write privileges to sysfs, or do we want to suggest that vendor
>> drivers should include equivalent information early in their migration
>> data stream to force a migration failure as early as possible for
>> incompatible data?  I think we need the latter regardless because the
>> vendor driver should never trust userspace like that, but does that
>> make any QEMU use of the sysfs version test itself redundant?  Thanks,
>>
>> Alex
> 
> hi Alex
> I think QEMU needs to check at least the code version of software interface in
> QEMU, like format of migration region, details of migration protocol,
> IOW, the software version QEMU interacts with vendor driver.
> This information should not be known to vendor driver until migration
> running to certain phase.
> e.g. if saving flow or format in source qemu is changed a little as a result
> of software upgrading, target qemu has to detect that from this
> version_id check, as vendor driver has no knowledge of that.
> Does that make sense?
> 

That is already done in qemu_loadvm_section_start_full()

    /* Validate version */
    if (version_id > se->version_id) {
        error_report("savevm: unsupported version %d for '%s' v%d",
                     version_id, idstr, se->version_id);
        return -EINVAL;
    }
    se->load_version_id = version_id;

Thanks,
Kirti
diff mbox series

Patch

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 4e9b4cce230b..5fb4c5329ede 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -249,6 +249,46 @@  static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
     return qemu_file_get_error(f);
 }
 
+/*
+ * Load the device config state section from the migration stream.
+ *
+ * Invokes the device's vfio_load_config callback when one is registered,
+ * then expects the section to be closed by VFIO_MIG_FLAG_END_OF_STATE.
+ *
+ * Returns 0 on success, the QEMUFile error if the stream has failed, or
+ * -EINVAL if the end-of-state marker is missing.
+ */
+static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    uint64_t data;
+    int ret;
+
+    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
+        vbasedev->ops->vfio_load_config(vbasedev, f);
+    }
+
+    data = qemu_get_be64(f);
+
+    /*
+     * Check the stream before interpreting the marker, so that a stream
+     * failure is reported as such rather than as an incorrect end flag.
+     */
+    ret = qemu_file_get_error(f);
+    if (ret) {
+        return ret;
+    }
+
+    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
+        error_report("%s: Failed loading device config space, "
+                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
+        return -EINVAL;
+    }
+
+    trace_vfio_load_device_config_state(vbasedev->name);
+    return 0;
+}
+
 /* ---------------------------------------------------------------------- */
 
 static int vfio_save_setup(QEMUFile *f, void *opaque)
@@ -421,12 +441,193 @@  static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
     return ret;
 }
 
+/*
+ * Prepare the device for receiving migration data.
+ *
+ * Maps the migration region when it has mmap-able areas, then moves the
+ * device to VFIO_DEVICE_STATE_RESUMING.
+ *
+ * Returns 0 on success or the error returned by the failing step.
+ */
+static int vfio_load_setup(QEMUFile *f, void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+    int ret = 0;
+
+    if (migration->region.buffer.mmaps) {
+        ret = vfio_region_mmap(&migration->region.buffer);
+        if (ret) {
+            error_report("%s: Failed to mmap VFIO migration region %d: %s",
+                         vbasedev->name, migration->region.index,
+                         strerror(-ret));
+            return ret;
+        }
+    }
+
+    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING);
+    if (ret) {
+        error_report("%s: Failed to set state RESUMING", vbasedev->name);
+    }
+    return ret;
+}
+
+/* Tear down after loading; the save-side cleanup is reused as is. */
+static int vfio_load_cleanup(void *opaque)
+{
+    vfio_save_cleanup(opaque);
+    return 0;
+}
+
+/*
+ * Consume this device's records from the migration stream.
+ *
+ * Each record starts with a 64-bit tag; records are processed until
+ * VFIO_MIG_FLAG_END_OF_STATE is read.  A device data record carries a
+ * 64-bit size followed by that many bytes, which are placed at the
+ * data_offset read from the migration region: directly into the mmap-ed
+ * area when find_data_region() returns one, otherwise via pwrite().
+ *
+ * Returns 0 on success, a QEMUFile error if the stream has failed,
+ * -ENOMEM if a bounce buffer cannot be allocated, or -EINVAL for a
+ * malformed stream or a failed region access.
+ */
+static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+    int ret = 0;
+    uint64_t data, data_size;
+
+    data = qemu_get_be64(f);
+    while (data != VFIO_MIG_FLAG_END_OF_STATE) {
+
+        trace_vfio_load_state(vbasedev->name, data);
+
+        switch (data) {
+        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
+        {
+            ret = vfio_load_device_config_state(f, opaque);
+            if (ret) {
+                return ret;
+            }
+            break;
+        }
+        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
+        {
+            /* The setup record has no payload, only the end marker. */
+            data = qemu_get_be64(f);
+            if (data != VFIO_MIG_FLAG_END_OF_STATE) {
+                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
+                             vbasedev->name, data);
+                return -EINVAL;
+            }
+            return ret;
+        }
+        case VFIO_MIG_FLAG_DEV_DATA_STATE:
+        {
+            VFIORegion *region = &migration->region.buffer;
+            void *buf = NULL;
+            bool buffer_mmaped = false;
+            uint64_t data_offset = 0;
+            ssize_t len;
+
+            data_size = qemu_get_be64(f);
+            if (data_size == 0) {
+                break;
+            }
+
+            len = pread(vbasedev->fd, &data_offset, sizeof(data_offset),
+                        region->fd_offset +
+                        offsetof(struct vfio_device_migration_info,
+                        data_offset));
+            if (len != sizeof(data_offset)) {
+                error_report("%s:Failed to get migration buffer data offset %zd",
+                             vbasedev->name, len);
+                return -EINVAL;
+            }
+
+            if (region->mmaps) {
+                buf = find_data_region(region, data_offset, data_size);
+            }
+
+            buffer_mmaped = (buf != NULL);
+
+            if (!buffer_mmaped) {
+                buf = g_try_malloc0(data_size);
+                if (!buf) {
+                    error_report("%s: Error allocating buffer ", __func__);
+                    return -ENOMEM;
+                }
+            }
+
+            qemu_get_buffer(f, buf, data_size);
+
+            /*
+             * Check the stream before announcing the chunk, so the device
+             * is never told to consume data that was not fully received.
+             */
+            ret = qemu_file_get_error(f);
+            if (ret) {
+                if (!buffer_mmaped) {
+                    g_free(buf);
+                }
+                return ret;
+            }
+
+            len = pwrite(vbasedev->fd, &data_size, sizeof(data_size),
+                         region->fd_offset +
+                       offsetof(struct vfio_device_migration_info, data_size));
+            if (len != sizeof(data_size)) {
+                error_report("%s: Failed to set migration buffer data size %zd",
+                             vbasedev->name, len);
+                if (!buffer_mmaped) {
+                    g_free(buf);
+                }
+                return -EINVAL;
+            }
+
+            if (!buffer_mmaped) {
+                len = pwrite(vbasedev->fd, buf, data_size,
+                             region->fd_offset + data_offset);
+                g_free(buf);
+
+                /* Keep the ssize_t result; int would truncate large sizes. */
+                if (len < 0 || (uint64_t)len != data_size) {
+                    error_report("%s: Failed to set migration buffer %zd",
+                                 vbasedev->name, len);
+                    return -EINVAL;
+                }
+            }
+            trace_vfio_load_state_device_data(vbasedev->name, data_offset,
+                                              data_size);
+            break;
+        }
+        default:
+            /* An unknown tag means the rest of the stream cannot be parsed. */
+            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
+            return -EINVAL;
+        }
+
+        ret = qemu_file_get_error(f);
+        if (ret) {
+            return ret;
+        }
+        data = qemu_get_be64(f);
+    }
+
+    return ret;
+}
+
 static SaveVMHandlers savevm_vfio_handlers = {
     .save_setup = vfio_save_setup,
     .save_cleanup = vfio_save_cleanup,
     .save_live_pending = vfio_save_pending,
     .save_live_iterate = vfio_save_iterate,
     .save_live_complete_precopy = vfio_save_complete_precopy,
+    .load_setup = vfio_load_setup,
+    .load_cleanup = vfio_load_cleanup,
+    .load_state = vfio_load_state,
 };
 
 /* ---------------------------------------------------------------------- */
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index bdf40ba368c7..ac065b559f4e 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -157,3 +157,6 @@  vfio_save_device_config_state(char *name) " (%s)"
 vfio_save_pending(char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64
 vfio_save_iterate(char *name, int data_size) " (%s) data_size %d"
 vfio_save_complete_precopy(char *name) " (%s)"
+vfio_load_device_config_state(char *name) " (%s)"
+vfio_load_state(char *name, uint64_t data) " (%s) data 0x%"PRIx64
+vfio_load_state_device_data(char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64