diff mbox series

[5/5] vfio/migration: support device memory capability

Message ID 1550566380-3788-1-git-send-email-yan.y.zhao@intel.com
State New
Headers show
Series QEMU VFIO live migration | expand

Commit Message

Yan Zhao Feb. 19, 2019, 8:53 a.m. UTC
If a device has device memory capability, save/load data from device memory
in pre-copy and stop-and-copy phases.

LOGGING state is set for device memory for dirty page logging:
in LOGGING state, get device memory returns whole device memory snapshot;
outside LOGGING state, get device memory returns dirty data since last get
operation.

Usually, device memory is very big, qemu needs to chunk it into several
pieces each with size of device memory region.

Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
---
 hw/vfio/migration.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 hw/vfio/pci.h       |   1 +
 2 files changed, 231 insertions(+), 5 deletions(-)

Comments

Dr. David Alan Gilbert Feb. 19, 2019, 11:25 a.m. UTC | #1
* Yan Zhao (yan.y.zhao@intel.com) wrote:
> If a device has device memory capability, save/load data from device memory
> in pre-copy and stop-and-copy phases.
> 
> LOGGING state is set for device memory for dirty page logging:
> in LOGGING state, get device memory returns whole device memory snapshot;
> outside LOGGING state, get device memory returns dirty data since last get
> operation.
> 
> Usually, device memory is very big, qemu needs to chunk it into several
> pieces each with size of device memory region.
> 
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> ---
>  hw/vfio/migration.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  hw/vfio/pci.h       |   1 +
>  2 files changed, 231 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 16d6395..f1e9309 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -203,6 +203,201 @@ static int vfio_load_data_device_config(VFIOPCIDevice *vdev,
>      return 0;
>  }
>  
> +static int vfio_get_device_memory_size(VFIOPCIDevice *vdev)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    VFIORegion *region_ctl =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> +    uint64_t len;
> +    int sz;
> +
> +    sz = sizeof(len);
> +    if (pread(vbasedev->fd, &len, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> +            != sz) {
> +        error_report("vfio: Failed to get length of device memory");
> +        return -1;
> +    }
> +    vdev->migration->devmem_size = len;
> +    return 0;
> +}
> +
> +static int vfio_set_device_memory_size(VFIOPCIDevice *vdev, uint64_t size)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    VFIORegion *region_ctl =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> +    int sz;
> +
> +    sz = sizeof(size);
> +    if (pwrite(vbasedev->fd, &size, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> +            != sz) {
> +        error_report("vfio: Failed to set length of device comemory");
> +        return -1;
> +    }
> +    vdev->migration->devmem_size = size;
> +    return 0;
> +}
> +
> +static
> +int vfio_save_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> +                                    uint64_t pos, uint64_t len)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    VFIORegion *region_ctl =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> +    VFIORegion *region_devmem =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> +    void *dest;
> +    uint32_t sz;
> +    uint8_t *buf = NULL;
> +    uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER;
> +
> +    if (len > region_devmem->size) {
> +        return -1;
> +    }
> +
> +    sz = sizeof(pos);
> +    if (pwrite(vbasedev->fd, &pos, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> +            != sz) {
> +        error_report("vfio: Failed to set save buffer pos");
> +        return -1;
> +    }
> +    sz = sizeof(action);
> +    if (pwrite(vbasedev->fd, &action, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> +            != sz) {
> +        error_report("vfio: Failed to set save buffer action");
> +        return -1;
> +    }
> +
> +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> +        buf = g_malloc(len);
> +        if (buf == NULL) {
> +            error_report("vfio: Failed to allocate memory for migrate");
> +            return -1;
> +        }
> +        if (pread(vbasedev->fd, buf, len, region_devmem->fd_offset) != len) {
> +            error_report("vfio: error load device memory buffer");

That's forgotten to g_free(buf)

> +            return -1;
> +        }
> +        qemu_put_be64(f, len);
> +        qemu_put_be64(f, pos);
> +        qemu_put_buffer(f, buf, len);
> +        g_free(buf);
> +    } else {
> +        dest = region_devmem->mmaps[0].mmap;
> +        qemu_put_be64(f, len);
> +        qemu_put_be64(f, pos);
> +        qemu_put_buffer(f, dest, len);
> +    }
> +    return 0;
> +}
> +
> +static int vfio_save_data_device_memory(VFIOPCIDevice *vdev, QEMUFile *f)
> +{
> +    VFIORegion *region_devmem =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> +    uint64_t total_len = vdev->migration->devmem_size;
> +    uint64_t pos = 0;
> +
> +    qemu_put_be64(f, total_len);
> +    while (pos < total_len) {
> +        uint64_t len = region_devmem->size;
> +
> +        if (pos + len >= total_len) {
> +            len = total_len - pos;
> +        }
> +        if (vfio_save_data_device_memory_chunk(vdev, f, pos, len)) {
> +            return -1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static
> +int vfio_load_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> +                                uint64_t pos, uint64_t len)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    VFIORegion *region_ctl =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> +    VFIORegion *region_devmem =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> +
> +    void *dest;
> +    uint32_t sz;
> +    uint8_t *buf = NULL;
> +    uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER;
> +
> +    if (len > region_devmem->size) {
> +        return -1;
> +    }
> +
> +    sz = sizeof(pos);
> +    if (pwrite(vbasedev->fd, &pos, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> +            != sz) {
> +        error_report("vfio: Failed to set device memory buffer pos");
> +        return -1;
> +    }
> +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> +        buf = g_malloc(len);
> +        if (buf == NULL) {
> +            error_report("vfio: Failed to allocate memory for migrate");
> +            return -1;
> +        }
> +        qemu_get_buffer(f, buf, len);
> +        if (pwrite(vbasedev->fd, buf, len,
> +                    region_devmem->fd_offset) != len) {
> +            error_report("vfio: Failed to load devie memory buffer");

Again, failed to free buf

> +            return -1;
> +        }
> +        g_free(buf);
> +    } else {
> +        dest = region_devmem->mmaps[0].mmap;
> +        qemu_get_buffer(f, dest, len);
> +    }

You might want to use qemu_file_get_error(f)  before writing the data
to the device, to check for the case of a read error on the migration
stream that happened somewhere in the pevious qemu_get's

> +    sz = sizeof(action);
> +    if (pwrite(vbasedev->fd, &action, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> +            != sz) {
> +        error_report("vfio: Failed to set load device memory buffer action");
> +        return -1;
> +    }
> +
> +    return 0;
> +
> +}
> +
> +static int vfio_load_data_device_memory(VFIOPCIDevice *vdev,
> +                        QEMUFile *f, uint64_t total_len)
> +{
> +    uint64_t pos = 0, len = 0;
> +
> +    vfio_set_device_memory_size(vdev, total_len);
> +
> +    while (pos + len < total_len) {
> +        len = qemu_get_be64(f);
> +        pos = qemu_get_be64(f);

Please check len/pos - always assume that the migration stream could
be (maliciously or accidentally) corrupt.

> +        vfio_load_data_device_memory_chunk(vdev, f, pos, len);
> +    }
> +
> +    return 0;
> +}
> +
> +
>  static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev,
>          uint64_t start_addr, uint64_t page_nr)
>  {
> @@ -377,6 +572,10 @@ static void vfio_save_live_pending(QEMUFile *f, void *opaque,
>          return;
>      }
>  
> +    /* get dirty data size of device memory */
> +    vfio_get_device_memory_size(vdev);
> +
> +    *res_precopy_only += vdev->migration->devmem_size;
>      return;
>  }
>  
> @@ -388,7 +587,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
>          return 0;
>      }
>  
> -    return 0;
> +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> +    /* get dirty data of device memory */
> +    return vfio_save_data_device_memory(vdev, f);
>  }
>  
>  static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f)
> @@ -458,6 +659,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>              len = qemu_get_be64(f);
>              vfio_load_data_device_config(vdev, f, len);
>              break;
> +        case VFIO_SAVE_FLAG_DEVMEMORY:
> +            len = qemu_get_be64(f);
> +            vfio_load_data_device_memory(vdev, f, len);
> +            break;
>          default:
>              ret = -EINVAL;
>          }
> @@ -503,6 +708,13 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>      VFIOPCIDevice *vdev = opaque;
>      int rc = 0;
>  
> +    if (vfio_device_data_cap_device_memory(vdev)) {
> +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY | VFIO_SAVE_FLAG_CONTINUE);
> +        /* get dirty data of device memory */
> +        vfio_get_device_memory_size(vdev);
> +        rc = vfio_save_data_device_memory(vdev, f);
> +    }
> +
>      qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE);
>      vfio_pci_save_config(vdev, f);
>  
> @@ -515,12 +727,22 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>  
>  static int vfio_save_setup(QEMUFile *f, void *opaque)
>  {
> +    int rc = 0;
>      VFIOPCIDevice *vdev = opaque;
> -    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> +
> +    if (vfio_device_data_cap_device_memory(vdev)) {
> +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP | VFIO_SAVE_FLAG_CONTINUE);
> +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> +        /* get whole snapshot of device memory */
> +        vfio_get_device_memory_size(vdev);
> +        rc = vfio_save_data_device_memory(vdev, f);
> +    } else {
> +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> +    }
>  
>      vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING |
>              VFIO_DEVICE_STATE_LOGGING);
> -    return 0;
> +    return rc;
>  }
>  
>  static int vfio_load_setup(QEMUFile *f, void *opaque)
> @@ -576,8 +798,11 @@ int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp)
>          goto error;
>      }
>  
> -    if (vfio_device_data_cap_device_memory(vdev)) {
> -        error_report("No suppport of data cap device memory Yet");
> +    if (vfio_device_data_cap_device_memory(vdev) &&
> +            vfio_device_state_region_setup(vdev,
> +              &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY],
> +              VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_MEMORY,
> +              "device-state-data-device-memory")) {
>          goto error;
>      }
>  
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 4b7b1bb..a2cc64b 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -69,6 +69,7 @@ typedef struct VFIOMigration {
>      uint32_t data_caps;
>      uint32_t device_state;
>      uint64_t devconfig_size;
> +    uint64_t devmem_size;
>      VMChangeStateEntry *vm_state;
>  } VFIOMigration;
>  
> -- 
> 2.7.4
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Christophe de Dinechin Feb. 19, 2019, 2:42 p.m. UTC | #2
> On 19 Feb 2019, at 09:53, Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> If a device has device memory capability, save/load data from device memory
> in pre-copy and stop-and-copy phases.
> 
> LOGGING state is set for device memory for dirty page logging:
> in LOGGING state, get device memory returns whole device memory snapshot;
> outside LOGGING state, get device memory returns dirty data since last get
> operation.
> 
> Usually, device memory is very big, qemu needs to chunk it into several
> pieces each with size of device memory region.
> 
> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> ---
> hw/vfio/migration.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++--
> hw/vfio/pci.h       |   1 +
> 2 files changed, 231 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 16d6395..f1e9309 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -203,6 +203,201 @@ static int vfio_load_data_device_config(VFIOPCIDevice *vdev,
>     return 0;
> }
> 
> +static int vfio_get_device_memory_size(VFIOPCIDevice *vdev)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    VFIORegion *region_ctl =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> +    uint64_t len;
> +    int sz;
> +
> +    sz = sizeof(len);
> +    if (pread(vbasedev->fd, &len, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> +            != sz) {
> +        error_report("vfio: Failed to get length of device memory”);

s/length/size/ ? (to be consistent with function name)
> +        return -1;
> +    }
> +    vdev->migration->devmem_size = len;
> +    return 0;
> +}
> +
> +static int vfio_set_device_memory_size(VFIOPCIDevice *vdev, uint64_t size)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    VFIORegion *region_ctl =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> +    int sz;
> +
> +    sz = sizeof(size);
> +    if (pwrite(vbasedev->fd, &size, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> +            != sz) {
> +        error_report("vfio: Failed to set length of device comemory”);

What is comemory? Typo?

Same comment about length vs size

> +        return -1;
> +    }
> +    vdev->migration->devmem_size = size;
> +    return 0;
> +}
> +
> +static
> +int vfio_save_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> +                                    uint64_t pos, uint64_t len)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    VFIORegion *region_ctl =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> +    VFIORegion *region_devmem =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> +    void *dest;
> +    uint32_t sz;
> +    uint8_t *buf = NULL;
> +    uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER;
> +
> +    if (len > region_devmem->size) {

Is it intentional that there is no error_report here?

> +        return -1;
> +    }
> +
> +    sz = sizeof(pos);
> +    if (pwrite(vbasedev->fd, &pos, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> +            != sz) {
> +        error_report("vfio: Failed to set save buffer pos");
> +        return -1;
> +    }
> +    sz = sizeof(action);
> +    if (pwrite(vbasedev->fd, &action, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> +            != sz) {
> +        error_report("vfio: Failed to set save buffer action");
> +        return -1;
> +    }
> +
> +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> +        buf = g_malloc(len);
> +        if (buf == NULL) {
> +            error_report("vfio: Failed to allocate memory for migrate”);
s/migrate/migration/ ?
> +            return -1;
> +        }
> +        if (pread(vbasedev->fd, buf, len, region_devmem->fd_offset) != len) {
> +            error_report("vfio: error load device memory buffer”);
s/load/loading/ ?
> +            return -1;
> +        }
> +        qemu_put_be64(f, len);
> +        qemu_put_be64(f, pos);
> +        qemu_put_buffer(f, buf, len);
> +        g_free(buf);
> +    } else {
> +        dest = region_devmem->mmaps[0].mmap;
> +        qemu_put_be64(f, len);
> +        qemu_put_be64(f, pos);
> +        qemu_put_buffer(f, dest, len);
> +    }
> +    return 0;
> +}
> +
> +static int vfio_save_data_device_memory(VFIOPCIDevice *vdev, QEMUFile *f)
> +{
> +    VFIORegion *region_devmem =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> +    uint64_t total_len = vdev->migration->devmem_size;
> +    uint64_t pos = 0;
> +
> +    qemu_put_be64(f, total_len);
> +    while (pos < total_len) {
> +        uint64_t len = region_devmem->size;
> +
> +        if (pos + len >= total_len) {
> +            len = total_len - pos;
> +        }
> +        if (vfio_save_data_device_memory_chunk(vdev, f, pos, len)) {
> +            return -1;
> +        }

I don’t see where pos is incremented in this loop

> +    }
> +
> +    return 0;
> +}
> +
> +static
> +int vfio_load_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> +                                uint64_t pos, uint64_t len)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    VFIORegion *region_ctl =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> +    VFIORegion *region_devmem =
> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> +
> +    void *dest;
> +    uint32_t sz;
> +    uint8_t *buf = NULL;
> +    uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER;
> +
> +    if (len > region_devmem->size) {

error_report?
> +        return -1;
> +    }
> +
> +    sz = sizeof(pos);
> +    if (pwrite(vbasedev->fd, &pos, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> +            != sz) {
> +        error_report("vfio: Failed to set device memory buffer pos");
> +        return -1;
> +    }
> +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> +        buf = g_malloc(len);
> +        if (buf == NULL) {
> +            error_report("vfio: Failed to allocate memory for migrate");
> +            return -1;
> +        }
> +        qemu_get_buffer(f, buf, len);
> +        if (pwrite(vbasedev->fd, buf, len,
> +                    region_devmem->fd_offset) != len) {
> +            error_report("vfio: Failed to load devie memory buffer");
> +            return -1;
> +        }
> +        g_free(buf);
> +    } else {
> +        dest = region_devmem->mmaps[0].mmap;
> +        qemu_get_buffer(f, dest, len);
> +    }
> +
> +    sz = sizeof(action);
> +    if (pwrite(vbasedev->fd, &action, sz,
> +                region_ctl->fd_offset +
> +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> +            != sz) {
> +        error_report("vfio: Failed to set load device memory buffer action");
> +        return -1;
> +    }
> +
> +    return 0;
> +
> +}
> +
> +static int vfio_load_data_device_memory(VFIOPCIDevice *vdev,
> +                        QEMUFile *f, uint64_t total_len)
> +{
> +    uint64_t pos = 0, len = 0;
> +
> +    vfio_set_device_memory_size(vdev, total_len);
> +
> +    while (pos + len < total_len) {
> +        len = qemu_get_be64(f);
> +        pos = qemu_get_be64(f);

Nit: load reads len/pos in the loop, whereas save does it in the
inner function (vfio_save_data_device_memory_chunk)

> +
> +        vfio_load_data_device_memory_chunk(vdev, f, pos, len);
> +    }
> +
> +    return 0;
> +}
> +
> +
> static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev,
>         uint64_t start_addr, uint64_t page_nr)
> {
> @@ -377,6 +572,10 @@ static void vfio_save_live_pending(QEMUFile *f, void *opaque,
>         return;
>     }
> 
> +    /* get dirty data size of device memory */
> +    vfio_get_device_memory_size(vdev);
> +
> +    *res_precopy_only += vdev->migration->devmem_size;
>     return;
> }
> 
> @@ -388,7 +587,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
>         return 0;
>     }
> 
> -    return 0;
> +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> +    /* get dirty data of device memory */
> +    return vfio_save_data_device_memory(vdev, f);
> }
> 
> static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f)
> @@ -458,6 +659,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>             len = qemu_get_be64(f);
>             vfio_load_data_device_config(vdev, f, len);
>             break;
> +        case VFIO_SAVE_FLAG_DEVMEMORY:
> +            len = qemu_get_be64(f);
> +            vfio_load_data_device_memory(vdev, f, len);
> +            break;
>         default:
>             ret = -EINVAL;
>         }
> @@ -503,6 +708,13 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>     VFIOPCIDevice *vdev = opaque;
>     int rc = 0;
> 
> +    if (vfio_device_data_cap_device_memory(vdev)) {
> +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY | VFIO_SAVE_FLAG_CONTINUE);
> +        /* get dirty data of device memory */
> +        vfio_get_device_memory_size(vdev);
> +        rc = vfio_save_data_device_memory(vdev, f);
> +    }
> +
>     qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE);
>     vfio_pci_save_config(vdev, f);
> 
> @@ -515,12 +727,22 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> 
> static int vfio_save_setup(QEMUFile *f, void *opaque)
> {
> +    int rc = 0;
>     VFIOPCIDevice *vdev = opaque;
> -    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> +
> +    if (vfio_device_data_cap_device_memory(vdev)) {
> +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP | VFIO_SAVE_FLAG_CONTINUE);
> +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> +        /* get whole snapshot of device memory */
> +        vfio_get_device_memory_size(vdev);
> +        rc = vfio_save_data_device_memory(vdev, f);
> +    } else {
> +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> +    }
> 
>     vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING |
>             VFIO_DEVICE_STATE_LOGGING);
> -    return 0;
> +    return rc;
> }
> 
> static int vfio_load_setup(QEMUFile *f, void *opaque)
> @@ -576,8 +798,11 @@ int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp)
>         goto error;
>     }
> 
> -    if (vfio_device_data_cap_device_memory(vdev)) {
> -        error_report("No suppport of data cap device memory Yet");
> +    if (vfio_device_data_cap_device_memory(vdev) &&
> +            vfio_device_state_region_setup(vdev,
> +              &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY],
> +              VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_MEMORY,
> +              "device-state-data-device-memory")) {
>         goto error;
>     }
> 
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 4b7b1bb..a2cc64b 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -69,6 +69,7 @@ typedef struct VFIOMigration {
>     uint32_t data_caps;
>     uint32_t device_state;
>     uint64_t devconfig_size;
> +    uint64_t devmem_size;
>     VMChangeStateEntry *vm_state;
> } VFIOMigration;
> 
> -- 
> 2.7.4
> 
> _______________________________________________
> intel-gvt-dev mailing list
> intel-gvt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev
Yan Zhao Feb. 20, 2019, 5:17 a.m. UTC | #3
On Tue, Feb 19, 2019 at 11:25:43AM +0000, Dr. David Alan Gilbert wrote:
> * Yan Zhao (yan.y.zhao@intel.com) wrote:
> > If a device has device memory capability, save/load data from device memory
> > in pre-copy and stop-and-copy phases.
> > 
> > LOGGING state is set for device memory for dirty page logging:
> > in LOGGING state, get device memory returns whole device memory snapshot;
> > outside LOGGING state, get device memory returns dirty data since last get
> > operation.
> > 
> > Usually, device memory is very big, qemu needs to chunk it into several
> > pieces each with size of device memory region.
> > 
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > ---
> >  hw/vfio/migration.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++--
> >  hw/vfio/pci.h       |   1 +
> >  2 files changed, 231 insertions(+), 5 deletions(-)
> > 
> > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> > index 16d6395..f1e9309 100644
> > --- a/hw/vfio/migration.c
> > +++ b/hw/vfio/migration.c
> > @@ -203,6 +203,201 @@ static int vfio_load_data_device_config(VFIOPCIDevice *vdev,
> >      return 0;
> >  }
> >  
> > +static int vfio_get_device_memory_size(VFIOPCIDevice *vdev)
> > +{
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    VFIORegion *region_ctl =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> > +    uint64_t len;
> > +    int sz;
> > +
> > +    sz = sizeof(len);
> > +    if (pread(vbasedev->fd, &len, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> > +            != sz) {
> > +        error_report("vfio: Failed to get length of device memory");
> > +        return -1;
> > +    }
> > +    vdev->migration->devmem_size = len;
> > +    return 0;
> > +}
> > +
> > +static int vfio_set_device_memory_size(VFIOPCIDevice *vdev, uint64_t size)
> > +{
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    VFIORegion *region_ctl =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> > +    int sz;
> > +
> > +    sz = sizeof(size);
> > +    if (pwrite(vbasedev->fd, &size, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> > +            != sz) {
> > +        error_report("vfio: Failed to set length of device comemory");
> > +        return -1;
> > +    }
> > +    vdev->migration->devmem_size = size;
> > +    return 0;
> > +}
> > +
> > +static
> > +int vfio_save_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> > +                                    uint64_t pos, uint64_t len)
> > +{
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    VFIORegion *region_ctl =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> > +    VFIORegion *region_devmem =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> > +    void *dest;
> > +    uint32_t sz;
> > +    uint8_t *buf = NULL;
> > +    uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER;
> > +
> > +    if (len > region_devmem->size) {
> > +        return -1;
> > +    }
> > +
> > +    sz = sizeof(pos);
> > +    if (pwrite(vbasedev->fd, &pos, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> > +            != sz) {
> > +        error_report("vfio: Failed to set save buffer pos");
> > +        return -1;
> > +    }
> > +    sz = sizeof(action);
> > +    if (pwrite(vbasedev->fd, &action, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> > +            != sz) {
> > +        error_report("vfio: Failed to set save buffer action");
> > +        return -1;
> > +    }
> > +
> > +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> > +        buf = g_malloc(len);
> > +        if (buf == NULL) {
> > +            error_report("vfio: Failed to allocate memory for migrate");
> > +            return -1;
> > +        }
> > +        if (pread(vbasedev->fd, buf, len, region_devmem->fd_offset) != len) {
> > +            error_report("vfio: error load device memory buffer");
> 
> That's forgotten to g_free(buf)
>
Right, I'll correct that.

> > +            return -1;
> > +        }
> > +        qemu_put_be64(f, len);
> > +        qemu_put_be64(f, pos);
> > +        qemu_put_buffer(f, buf, len);
> > +        g_free(buf);
> > +    } else {
> > +        dest = region_devmem->mmaps[0].mmap;
> > +        qemu_put_be64(f, len);
> > +        qemu_put_be64(f, pos);
> > +        qemu_put_buffer(f, dest, len);
> > +    }
> > +    return 0;
> > +}
> > +
> > +static int vfio_save_data_device_memory(VFIOPCIDevice *vdev, QEMUFile *f)
> > +{
> > +    VFIORegion *region_devmem =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> > +    uint64_t total_len = vdev->migration->devmem_size;
> > +    uint64_t pos = 0;
> > +
> > +    qemu_put_be64(f, total_len);
> > +    while (pos < total_len) {
> > +        uint64_t len = region_devmem->size;
> > +
> > +        if (pos + len >= total_len) {
> > +            len = total_len - pos;
> > +        }
> > +        if (vfio_save_data_device_memory_chunk(vdev, f, pos, len)) {
> > +            return -1;
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static
> > +int vfio_load_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> > +                                uint64_t pos, uint64_t len)
> > +{
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    VFIORegion *region_ctl =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> > +    VFIORegion *region_devmem =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> > +
> > +    void *dest;
> > +    uint32_t sz;
> > +    uint8_t *buf = NULL;
> > +    uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER;
> > +
> > +    if (len > region_devmem->size) {
> > +        return -1;
> > +    }
> > +
> > +    sz = sizeof(pos);
> > +    if (pwrite(vbasedev->fd, &pos, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> > +            != sz) {
> > +        error_report("vfio: Failed to set device memory buffer pos");
> > +        return -1;
> > +    }
> > +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> > +        buf = g_malloc(len);
> > +        if (buf == NULL) {
> > +            error_report("vfio: Failed to allocate memory for migrate");
> > +            return -1;
> > +        }
> > +        qemu_get_buffer(f, buf, len);
> > +        if (pwrite(vbasedev->fd, buf, len,
> > +                    region_devmem->fd_offset) != len) {
> > +            error_report("vfio: Failed to load devie memory buffer");
> 
> Again, failed to free buf
> 
Right, I'll correct that.
> > +            return -1;
> > +        }
> > +        g_free(buf);
> > +    } else {
> > +        dest = region_devmem->mmaps[0].mmap;
> > +        qemu_get_buffer(f, dest, len);
> > +    }
> 
> You might want to use qemu_file_get_error(f)  before writing the data
> to the device, to check for the case of a read error on the migration
> stream that happened somewhere in the pevious qemu_get's
>

ok.

> > +    sz = sizeof(action);
> > +    if (pwrite(vbasedev->fd, &action, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> > +            != sz) {
> > +        error_report("vfio: Failed to set load device memory buffer action");
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +
> > +}
> > +
> > +static int vfio_load_data_device_memory(VFIOPCIDevice *vdev,
> > +                        QEMUFile *f, uint64_t total_len)
> > +{
> > +    uint64_t pos = 0, len = 0;
> > +
> > +    vfio_set_device_memory_size(vdev, total_len);
> > +
> > +    while (pos + len < total_len) {
> > +        len = qemu_get_be64(f);
> > +        pos = qemu_get_be64(f);
> 
> Please check len/pos - always assume that the migration stream could
> be (maliciously or accidentally) corrupt.
>
ok.

> > +        vfio_load_data_device_memory_chunk(vdev, f, pos, len);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +
> >  static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev,
> >          uint64_t start_addr, uint64_t page_nr)
> >  {
> > @@ -377,6 +572,10 @@ static void vfio_save_live_pending(QEMUFile *f, void *opaque,
> >          return;
> >      }
> >  
> > +    /* get dirty data size of device memory */
> > +    vfio_get_device_memory_size(vdev);
> > +
> > +    *res_precopy_only += vdev->migration->devmem_size;
> >      return;
> >  }
> >  
> > @@ -388,7 +587,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
> >          return 0;
> >      }
> >  
> > -    return 0;
> > +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> > +    /* get dirty data of device memory */
> > +    return vfio_save_data_device_memory(vdev, f);
> >  }
> >  
> >  static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f)
> > @@ -458,6 +659,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
> >              len = qemu_get_be64(f);
> >              vfio_load_data_device_config(vdev, f, len);
> >              break;
> > +        case VFIO_SAVE_FLAG_DEVMEMORY:
> > +            len = qemu_get_be64(f);
> > +            vfio_load_data_device_memory(vdev, f, len);
> > +            break;
> >          default:
> >              ret = -EINVAL;
> >          }
> > @@ -503,6 +708,13 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> >      VFIOPCIDevice *vdev = opaque;
> >      int rc = 0;
> >  
> > +    if (vfio_device_data_cap_device_memory(vdev)) {
> > +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY | VFIO_SAVE_FLAG_CONTINUE);
> > +        /* get dirty data of device memory */
> > +        vfio_get_device_memory_size(vdev);
> > +        rc = vfio_save_data_device_memory(vdev, f);
> > +    }
> > +
> >      qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE);
> >      vfio_pci_save_config(vdev, f);
> >  
> > @@ -515,12 +727,22 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> >  
> >  static int vfio_save_setup(QEMUFile *f, void *opaque)
> >  {
> > +    int rc = 0;
> >      VFIOPCIDevice *vdev = opaque;
> > -    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> > +
> > +    if (vfio_device_data_cap_device_memory(vdev)) {
> > +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP | VFIO_SAVE_FLAG_CONTINUE);
> > +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> > +        /* get whole snapshot of device memory */
> > +        vfio_get_device_memory_size(vdev);
> > +        rc = vfio_save_data_device_memory(vdev, f);
> > +    } else {
> > +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> > +    }
> >  
> >      vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING |
> >              VFIO_DEVICE_STATE_LOGGING);
> > -    return 0;
> > +    return rc;
> >  }
> >  
> >  static int vfio_load_setup(QEMUFile *f, void *opaque)
> > @@ -576,8 +798,11 @@ int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp)
> >          goto error;
> >      }
> >  
> > -    if (vfio_device_data_cap_device_memory(vdev)) {
> > -        error_report("No suppport of data cap device memory Yet");
> > +    if (vfio_device_data_cap_device_memory(vdev) &&
> > +            vfio_device_state_region_setup(vdev,
> > +              &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY],
> > +              VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_MEMORY,
> > +              "device-state-data-device-memory")) {
> >          goto error;
> >      }
> >  
> > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > index 4b7b1bb..a2cc64b 100644
> > --- a/hw/vfio/pci.h
> > +++ b/hw/vfio/pci.h
> > @@ -69,6 +69,7 @@ typedef struct VFIOMigration {
> >      uint32_t data_caps;
> >      uint32_t device_state;
> >      uint64_t devconfig_size;
> > +    uint64_t devmem_size;
> >      VMChangeStateEntry *vm_state;
> >  } VFIOMigration;
> >  
> > -- 
> > 2.7.4
> > 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> _______________________________________________
> intel-gvt-dev mailing list
> intel-gvt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev
Yan Zhao Feb. 20, 2019, 7:58 a.m. UTC | #4
On Tue, Feb 19, 2019 at 03:42:36PM +0100, Christophe de Dinechin wrote:
> 
> 
> > On 19 Feb 2019, at 09:53, Yan Zhao <yan.y.zhao@intel.com> wrote:
> > 
> > If a device has device memory capability, save/load data from device memory
> > in pre-copy and stop-and-copy phases.
> > 
> > LOGGING state is set for device memory for dirty page logging:
> > in LOGGING state, get device memory returns whole device memory snapshot;
> > outside LOGGING state, get device memory returns dirty data since last get
> > operation.
> > 
> > Usually, device memory is very big, qemu needs to chunk it into several
> > pieces each with size of device memory region.
> > 
> > Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > ---
> > hw/vfio/migration.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++--
> > hw/vfio/pci.h       |   1 +
> > 2 files changed, 231 insertions(+), 5 deletions(-)
> > 
> > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> > index 16d6395..f1e9309 100644
> > --- a/hw/vfio/migration.c
> > +++ b/hw/vfio/migration.c
> > @@ -203,6 +203,201 @@ static int vfio_load_data_device_config(VFIOPCIDevice *vdev,
> >     return 0;
> > }
> > 
> > +static int vfio_get_device_memory_size(VFIOPCIDevice *vdev)
> > +{
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    VFIORegion *region_ctl =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> > +    uint64_t len;
> > +    int sz;
> > +
> > +    sz = sizeof(len);
> > +    if (pread(vbasedev->fd, &len, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> > +            != sz) {
> > +        error_report("vfio: Failed to get length of device memory”);
> 
> s/length/size/ ? (to be consistent with function name)

ok. thanks
> > +        return -1;
> > +    }
> > +    vdev->migration->devmem_size = len;
> > +    return 0;
> > +}
> > +
> > +static int vfio_set_device_memory_size(VFIOPCIDevice *vdev, uint64_t size)
> > +{
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    VFIORegion *region_ctl =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> > +    int sz;
> > +
> > +    sz = sizeof(size);
> > +    if (pwrite(vbasedev->fd, &size, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> > +            != sz) {
> > +        error_report("vfio: Failed to set length of device comemory”);
> 
> What is comemory? Typo?

Right, typo. should be "memory" :)
> 
> Same comment about length vs size
>
got it. thanks

> > +        return -1;
> > +    }
> > +    vdev->migration->devmem_size = size;
> > +    return 0;
> > +}
> > +
> > +static
> > +int vfio_save_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> > +                                    uint64_t pos, uint64_t len)
> > +{
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    VFIORegion *region_ctl =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> > +    VFIORegion *region_devmem =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> > +    void *dest;
> > +    uint32_t sz;
> > +    uint8_t *buf = NULL;
> > +    uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER;
> > +
> > +    if (len > region_devmem->size) {
> 
> Is it intentional that there is no error_report here?
>
an error_report here may be better.
> > +        return -1;
> > +    }
> > +
> > +    sz = sizeof(pos);
> > +    if (pwrite(vbasedev->fd, &pos, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> > +            != sz) {
> > +        error_report("vfio: Failed to set save buffer pos");
> > +        return -1;
> > +    }
> > +    sz = sizeof(action);
> > +    if (pwrite(vbasedev->fd, &action, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> > +            != sz) {
> > +        error_report("vfio: Failed to set save buffer action");
> > +        return -1;
> > +    }
> > +
> > +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> > +        buf = g_malloc(len);
> > +        if (buf == NULL) {
> > +            error_report("vfio: Failed to allocate memory for migrate”);
> s/migrate/migration/ ?

yes, thanks
> > +            return -1;
> > +        }
> > +        if (pread(vbasedev->fd, buf, len, region_devmem->fd_offset) != len) {
> > +            error_report("vfio: error load device memory buffer”);
> s/load/loading/ ?
error to load? :)

> > +            return -1;
> > +        }
> > +        qemu_put_be64(f, len);
> > +        qemu_put_be64(f, pos);
> > +        qemu_put_buffer(f, buf, len);
> > +        g_free(buf);
> > +    } else {
> > +        dest = region_devmem->mmaps[0].mmap;
> > +        qemu_put_be64(f, len);
> > +        qemu_put_be64(f, pos);
> > +        qemu_put_buffer(f, dest, len);
> > +    }
> > +    return 0;
> > +}
> > +
> > +static int vfio_save_data_device_memory(VFIOPCIDevice *vdev, QEMUFile *f)
> > +{
> > +    VFIORegion *region_devmem =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> > +    uint64_t total_len = vdev->migration->devmem_size;
> > +    uint64_t pos = 0;
> > +
> > +    qemu_put_be64(f, total_len);
> > +    while (pos < total_len) {
> > +        uint64_t len = region_devmem->size;
> > +
> > +        if (pos + len >= total_len) {
> > +            len = total_len - pos;
> > +        }
> > +        if (vfio_save_data_device_memory_chunk(vdev, f, pos, len)) {
> > +            return -1;
> > +        }
> 
> I don’t see where pos is incremented in this loop
> 
yes, missing one line "pos += len;"
Currently, code is not verified in hardware with device memory cap on.
Thanks:)
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static
> > +int vfio_load_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> > +                                uint64_t pos, uint64_t len)
> > +{
> > +    VFIODevice *vbasedev = &vdev->vbasedev;
> > +    VFIORegion *region_ctl =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> > +    VFIORegion *region_devmem =
> > +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> > +
> > +    void *dest;
> > +    uint32_t sz;
> > +    uint8_t *buf = NULL;
> > +    uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER;
> > +
> > +    if (len > region_devmem->size) {
> 
> error_report?

seems better to add error_report.
> > +        return -1;
> > +    }
> > +
> > +    sz = sizeof(pos);
> > +    if (pwrite(vbasedev->fd, &pos, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> > +            != sz) {
> > +        error_report("vfio: Failed to set device memory buffer pos");
> > +        return -1;
> > +    }
> > +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> > +        buf = g_malloc(len);
> > +        if (buf == NULL) {
> > +            error_report("vfio: Failed to allocate memory for migrate");
> > +            return -1;
> > +        }
> > +        qemu_get_buffer(f, buf, len);
> > +        if (pwrite(vbasedev->fd, buf, len,
> > +                    region_devmem->fd_offset) != len) {
> > +            error_report("vfio: Failed to load devie memory buffer");
> > +            return -1;
> > +        }
> > +        g_free(buf);
> > +    } else {
> > +        dest = region_devmem->mmaps[0].mmap;
> > +        qemu_get_buffer(f, dest, len);
> > +    }
> > +
> > +    sz = sizeof(action);
> > +    if (pwrite(vbasedev->fd, &action, sz,
> > +                region_ctl->fd_offset +
> > +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> > +            != sz) {
> > +        error_report("vfio: Failed to set load device memory buffer action");
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +
> > +}
> > +
> > +static int vfio_load_data_device_memory(VFIOPCIDevice *vdev,
> > +                        QEMUFile *f, uint64_t total_len)
> > +{
> > +    uint64_t pos = 0, len = 0;
> > +
> > +    vfio_set_device_memory_size(vdev, total_len);
> > +
> > +    while (pos + len < total_len) {
> > +        len = qemu_get_be64(f);
> > +        pos = qemu_get_be64(f);
> 
> Nit: load reads len/pos in the loop, whereas save does it in the
> inner function (vfio_save_data_device_memory_chunk)
right, load has to read len/pos in the loop.
> 
> > +
> > +        vfio_load_data_device_memory_chunk(vdev, f, pos, len);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +
> > static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev,
> >         uint64_t start_addr, uint64_t page_nr)
> > {
> > @@ -377,6 +572,10 @@ static void vfio_save_live_pending(QEMUFile *f, void *opaque,
> >         return;
> >     }
> > 
> > +    /* get dirty data size of device memory */
> > +    vfio_get_device_memory_size(vdev);
> > +
> > +    *res_precopy_only += vdev->migration->devmem_size;
> >     return;
> > }
> > 
> > @@ -388,7 +587,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
> >         return 0;
> >     }
> > 
> > -    return 0;
> > +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> > +    /* get dirty data of device memory */
> > +    return vfio_save_data_device_memory(vdev, f);
> > }
> > 
> > static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f)
> > @@ -458,6 +659,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
> >             len = qemu_get_be64(f);
> >             vfio_load_data_device_config(vdev, f, len);
> >             break;
> > +        case VFIO_SAVE_FLAG_DEVMEMORY:
> > +            len = qemu_get_be64(f);
> > +            vfio_load_data_device_memory(vdev, f, len);
> > +            break;
> >         default:
> >             ret = -EINVAL;
> >         }
> > @@ -503,6 +708,13 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> >     VFIOPCIDevice *vdev = opaque;
> >     int rc = 0;
> > 
> > +    if (vfio_device_data_cap_device_memory(vdev)) {
> > +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY | VFIO_SAVE_FLAG_CONTINUE);
> > +        /* get dirty data of device memory */
> > +        vfio_get_device_memory_size(vdev);
> > +        rc = vfio_save_data_device_memory(vdev, f);
> > +    }
> > +
> >     qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE);
> >     vfio_pci_save_config(vdev, f);
> > 
> > @@ -515,12 +727,22 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> > 
> > static int vfio_save_setup(QEMUFile *f, void *opaque)
> > {
> > +    int rc = 0;
> >     VFIOPCIDevice *vdev = opaque;
> > -    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> > +
> > +    if (vfio_device_data_cap_device_memory(vdev)) {
> > +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP | VFIO_SAVE_FLAG_CONTINUE);
> > +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> > +        /* get whole snapshot of device memory */
> > +        vfio_get_device_memory_size(vdev);
> > +        rc = vfio_save_data_device_memory(vdev, f);
> > +    } else {
> > +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> > +    }
> > 
> >     vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING |
> >             VFIO_DEVICE_STATE_LOGGING);
> > -    return 0;
> > +    return rc;
> > }
> > 
> > static int vfio_load_setup(QEMUFile *f, void *opaque)
> > @@ -576,8 +798,11 @@ int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp)
> >         goto error;
> >     }
> > 
> > -    if (vfio_device_data_cap_device_memory(vdev)) {
> > -        error_report("No suppport of data cap device memory Yet");
> > +    if (vfio_device_data_cap_device_memory(vdev) &&
> > +            vfio_device_state_region_setup(vdev,
> > +              &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY],
> > +              VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_MEMORY,
> > +              "device-state-data-device-memory")) {
> >         goto error;
> >     }
> > 
> > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > index 4b7b1bb..a2cc64b 100644
> > --- a/hw/vfio/pci.h
> > +++ b/hw/vfio/pci.h
> > @@ -69,6 +69,7 @@ typedef struct VFIOMigration {
> >     uint32_t data_caps;
> >     uint32_t device_state;
> >     uint64_t devconfig_size;
> > +    uint64_t devmem_size;
> >     VMChangeStateEntry *vm_state;
> > } VFIOMigration;
> > 
> > -- 
> > 2.7.4
> > 
> > _______________________________________________
> > intel-gvt-dev mailing list
> > intel-gvt-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev
> 
> _______________________________________________
> intel-gvt-dev mailing list
> intel-gvt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev
Christophe de Dinechin Feb. 20, 2019, 10:14 a.m. UTC | #5
> On 20 Feb 2019, at 08:58, Zhao Yan <yan.y.zhao@intel.com> wrote:
> 
> On Tue, Feb 19, 2019 at 03:42:36PM +0100, Christophe de Dinechin wrote:
>> 
>> 
>>> On 19 Feb 2019, at 09:53, Yan Zhao <yan.y.zhao@intel.com> wrote:
>>> 
>>> If a device has device memory capability, save/load data from device memory
>>> in pre-copy and stop-and-copy phases.
>>> 
>>> LOGGING state is set for device memory for dirty page logging:
>>> in LOGGING state, get device memory returns whole device memory snapshot;
>>> outside LOGGING state, get device memory returns dirty data since last get
>>> operation.
>>> 
>>> Usually, device memory is very big, qemu needs to chunk it into several
>>> pieces each with size of device memory region.
>>> 
>>> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>> ---
>>> hw/vfio/migration.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>>> hw/vfio/pci.h       |   1 +
>>> 2 files changed, 231 insertions(+), 5 deletions(-)
>>> 
>>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>>> index 16d6395..f1e9309 100644
>>> --- a/hw/vfio/migration.c
>>> +++ b/hw/vfio/migration.c
>>> @@ -203,6 +203,201 @@ static int vfio_load_data_device_config(VFIOPCIDevice *vdev,
>>>    return 0;
>>> }
>>> 
>>> +static int vfio_get_device_memory_size(VFIOPCIDevice *vdev)
>>> +{
>>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>>> +    VFIORegion *region_ctl =
>>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
>>> +    uint64_t len;
>>> +    int sz;
>>> +
>>> +    sz = sizeof(len);
>>> +    if (pread(vbasedev->fd, &len, sz,
>>> +                region_ctl->fd_offset +
>>> +                offsetof(struct vfio_device_state_ctl, device_memory.size))
>>> +            != sz) {
>>> +        error_report("vfio: Failed to get length of device memory”);
>> 
>> s/length/size/ ? (to be consistent with function name)
> 
> ok. thanks
>>> +        return -1;
>>> +    }
>>> +    vdev->migration->devmem_size = len;
>>> +    return 0;
>>> +}
>>> +
>>> +static int vfio_set_device_memory_size(VFIOPCIDevice *vdev, uint64_t size)
>>> +{
>>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>>> +    VFIORegion *region_ctl =
>>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
>>> +    int sz;
>>> +
>>> +    sz = sizeof(size);
>>> +    if (pwrite(vbasedev->fd, &size, sz,
>>> +                region_ctl->fd_offset +
>>> +                offsetof(struct vfio_device_state_ctl, device_memory.size))
>>> +            != sz) {
>>> +        error_report("vfio: Failed to set length of device comemory”);
>> 
>> What is comemory? Typo?
> 
> Right, typo. should be "memory" :)
>> 
>> Same comment about length vs size
>> 
> got it. thanks
> 
>>> +        return -1;
>>> +    }
>>> +    vdev->migration->devmem_size = size;
>>> +    return 0;
>>> +}
>>> +
>>> +static
>>> +int vfio_save_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
>>> +                                    uint64_t pos, uint64_t len)
>>> +{
>>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>>> +    VFIORegion *region_ctl =
>>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
>>> +    VFIORegion *region_devmem =
>>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
>>> +    void *dest;
>>> +    uint32_t sz;
>>> +    uint8_t *buf = NULL;
>>> +    uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER;
>>> +
>>> +    if (len > region_devmem->size) {
>> 
>> Is it intentional that there is no error_report here?
>> 
> an error_report here may be better.
>>> +        return -1;
>>> +    }
>>> +
>>> +    sz = sizeof(pos);
>>> +    if (pwrite(vbasedev->fd, &pos, sz,
>>> +                region_ctl->fd_offset +
>>> +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
>>> +            != sz) {
>>> +        error_report("vfio: Failed to set save buffer pos");
>>> +        return -1;
>>> +    }
>>> +    sz = sizeof(action);
>>> +    if (pwrite(vbasedev->fd, &action, sz,
>>> +                region_ctl->fd_offset +
>>> +                offsetof(struct vfio_device_state_ctl, device_memory.action))
>>> +            != sz) {
>>> +        error_report("vfio: Failed to set save buffer action");
>>> +        return -1;
>>> +    }
>>> +
>>> +    if (!vfio_device_state_region_mmaped(region_devmem)) {
>>> +        buf = g_malloc(len);
>>> +        if (buf == NULL) {
>>> +            error_report("vfio: Failed to allocate memory for migrate”);
>> s/migrate/migration/ ?
> 
> yes, thanks
>>> +            return -1;
>>> +        }
>>> +        if (pread(vbasedev->fd, buf, len, region_devmem->fd_offset) != len) {
>>> +            error_report("vfio: error load device memory buffer”);
>> s/load/loading/ ?
> error to load? :)

I’d check with a native speaker, but I believe it’s “error loading”.

To me (to be checked), the two sentences don’t have the same meaning:

“It is an error to load device memory buffer” -> “You are not allowed to do that”
“I had an error loading device memory buffer” -> “I tried, but it failed"

> 
>>> +            return -1;
>>> +        }
>>> +        qemu_put_be64(f, len);
>>> +        qemu_put_be64(f, pos);
>>> +        qemu_put_buffer(f, buf, len);
>>> +        g_free(buf);
>>> +    } else {
>>> +        dest = region_devmem->mmaps[0].mmap;
>>> +        qemu_put_be64(f, len);
>>> +        qemu_put_be64(f, pos);
>>> +        qemu_put_buffer(f, dest, len);
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>> +static int vfio_save_data_device_memory(VFIOPCIDevice *vdev, QEMUFile *f)
>>> +{
>>> +    VFIORegion *region_devmem =
>>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
>>> +    uint64_t total_len = vdev->migration->devmem_size;
>>> +    uint64_t pos = 0;
>>> +
>>> +    qemu_put_be64(f, total_len);
>>> +    while (pos < total_len) {
>>> +        uint64_t len = region_devmem->size;
>>> +
>>> +        if (pos + len >= total_len) {
>>> +            len = total_len - pos;
>>> +        }
>>> +        if (vfio_save_data_device_memory_chunk(vdev, f, pos, len)) {
>>> +            return -1;
>>> +        }
>> 
>> I don’t see where pos is incremented in this loop
>> 
> yes, missing one line "pos += len;"
> Currently, code is not verified in hardware with device memory cap on.
> Thanks:)
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static
>>> +int vfio_load_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
>>> +                                uint64_t pos, uint64_t len)
>>> +{
>>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>>> +    VFIORegion *region_ctl =
>>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
>>> +    VFIORegion *region_devmem =
>>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
>>> +
>>> +    void *dest;
>>> +    uint32_t sz;
>>> +    uint8_t *buf = NULL;
>>> +    uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER;
>>> +
>>> +    if (len > region_devmem->size) {
>> 
>> error_report?
> 
> seems better to add error_report.
>>> +        return -1;
>>> +    }
>>> +
>>> +    sz = sizeof(pos);
>>> +    if (pwrite(vbasedev->fd, &pos, sz,
>>> +                region_ctl->fd_offset +
>>> +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
>>> +            != sz) {
>>> +        error_report("vfio: Failed to set device memory buffer pos");
>>> +        return -1;
>>> +    }
>>> +    if (!vfio_device_state_region_mmaped(region_devmem)) {
>>> +        buf = g_malloc(len);
>>> +        if (buf == NULL) {
>>> +            error_report("vfio: Failed to allocate memory for migrate");
>>> +            return -1;
>>> +        }
>>> +        qemu_get_buffer(f, buf, len);
>>> +        if (pwrite(vbasedev->fd, buf, len,
>>> +                    region_devmem->fd_offset) != len) {
>>> +            error_report("vfio: Failed to load devie memory buffer");
>>> +            return -1;
>>> +        }
>>> +        g_free(buf);
>>> +    } else {
>>> +        dest = region_devmem->mmaps[0].mmap;
>>> +        qemu_get_buffer(f, dest, len);
>>> +    }
>>> +
>>> +    sz = sizeof(action);
>>> +    if (pwrite(vbasedev->fd, &action, sz,
>>> +                region_ctl->fd_offset +
>>> +                offsetof(struct vfio_device_state_ctl, device_memory.action))
>>> +            != sz) {
>>> +        error_report("vfio: Failed to set load device memory buffer action");
>>> +        return -1;
>>> +    }
>>> +
>>> +    return 0;
>>> +
>>> +}
>>> +
>>> +static int vfio_load_data_device_memory(VFIOPCIDevice *vdev,
>>> +                        QEMUFile *f, uint64_t total_len)
>>> +{
>>> +    uint64_t pos = 0, len = 0;
>>> +
>>> +    vfio_set_device_memory_size(vdev, total_len);
>>> +
>>> +    while (pos + len < total_len) {
>>> +        len = qemu_get_be64(f);
>>> +        pos = qemu_get_be64(f);
>> 
>> Nit: load reads len/pos in the loop, whereas save does it in the
>> inner function (vfio_save_data_device_memory_chunk)
> right, load has to read len/pos in the loop.
>> 
>>> +
>>> +        vfio_load_data_device_memory_chunk(vdev, f, pos, len);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +
>>> static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev,
>>>        uint64_t start_addr, uint64_t page_nr)
>>> {
>>> @@ -377,6 +572,10 @@ static void vfio_save_live_pending(QEMUFile *f, void *opaque,
>>>        return;
>>>    }
>>> 
>>> +    /* get dirty data size of device memory */
>>> +    vfio_get_device_memory_size(vdev);
>>> +
>>> +    *res_precopy_only += vdev->migration->devmem_size;
>>>    return;
>>> }
>>> 
>>> @@ -388,7 +587,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
>>>        return 0;
>>>    }
>>> 
>>> -    return 0;
>>> +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
>>> +    /* get dirty data of device memory */
>>> +    return vfio_save_data_device_memory(vdev, f);
>>> }
>>> 
>>> static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f)
>>> @@ -458,6 +659,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>>>            len = qemu_get_be64(f);
>>>            vfio_load_data_device_config(vdev, f, len);
>>>            break;
>>> +        case VFIO_SAVE_FLAG_DEVMEMORY:
>>> +            len = qemu_get_be64(f);
>>> +            vfio_load_data_device_memory(vdev, f, len);
>>> +            break;
>>>        default:
>>>            ret = -EINVAL;
>>>        }
>>> @@ -503,6 +708,13 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>>    VFIOPCIDevice *vdev = opaque;
>>>    int rc = 0;
>>> 
>>> +    if (vfio_device_data_cap_device_memory(vdev)) {
>>> +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY | VFIO_SAVE_FLAG_CONTINUE);
>>> +        /* get dirty data of device memory */
>>> +        vfio_get_device_memory_size(vdev);
>>> +        rc = vfio_save_data_device_memory(vdev, f);
>>> +    }
>>> +
>>>    qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE);
>>>    vfio_pci_save_config(vdev, f);
>>> 
>>> @@ -515,12 +727,22 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>> 
>>> static int vfio_save_setup(QEMUFile *f, void *opaque)
>>> {
>>> +    int rc = 0;
>>>    VFIOPCIDevice *vdev = opaque;
>>> -    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
>>> +
>>> +    if (vfio_device_data_cap_device_memory(vdev)) {
>>> +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP | VFIO_SAVE_FLAG_CONTINUE);
>>> +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
>>> +        /* get whole snapshot of device memory */
>>> +        vfio_get_device_memory_size(vdev);
>>> +        rc = vfio_save_data_device_memory(vdev, f);
>>> +    } else {
>>> +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
>>> +    }
>>> 
>>>    vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING |
>>>            VFIO_DEVICE_STATE_LOGGING);
>>> -    return 0;
>>> +    return rc;
>>> }
>>> 
>>> static int vfio_load_setup(QEMUFile *f, void *opaque)
>>> @@ -576,8 +798,11 @@ int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp)
>>>        goto error;
>>>    }
>>> 
>>> -    if (vfio_device_data_cap_device_memory(vdev)) {
>>> -        error_report("No suppport of data cap device memory Yet");
>>> +    if (vfio_device_data_cap_device_memory(vdev) &&
>>> +            vfio_device_state_region_setup(vdev,
>>> +              &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY],
>>> +              VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_MEMORY,
>>> +              "device-state-data-device-memory")) {
>>>        goto error;
>>>    }
>>> 
>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>>> index 4b7b1bb..a2cc64b 100644
>>> --- a/hw/vfio/pci.h
>>> +++ b/hw/vfio/pci.h
>>> @@ -69,6 +69,7 @@ typedef struct VFIOMigration {
>>>    uint32_t data_caps;
>>>    uint32_t device_state;
>>>    uint64_t devconfig_size;
>>> +    uint64_t devmem_size;
>>>    VMChangeStateEntry *vm_state;
>>> } VFIOMigration;
>>> 
>>> -- 
>>> 2.7.4
>>> 
>>> _______________________________________________
>>> intel-gvt-dev mailing list
>>> intel-gvt-dev@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev
>> 
>> _______________________________________________
>> intel-gvt-dev mailing list
>> intel-gvt-dev@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev
Yan Zhao Feb. 21, 2019, 12:07 a.m. UTC | #6
On Wed, Feb 20, 2019 at 11:14:24AM +0100, Christophe de Dinechin wrote:
> 
> 
> > On 20 Feb 2019, at 08:58, Zhao Yan <yan.y.zhao@intel.com> wrote:
> > 
> > On Tue, Feb 19, 2019 at 03:42:36PM +0100, Christophe de Dinechin wrote:
> >> 
> >> 
> >>> On 19 Feb 2019, at 09:53, Yan Zhao <yan.y.zhao@intel.com> wrote:
> >>> 
> >>> If a device has device memory capability, save/load data from device memory
> >>> in pre-copy and stop-and-copy phases.
> >>> 
> >>> LOGGING state is set for device memory for dirty page logging:
> >>> in LOGGING state, get device memory returns whole device memory snapshot;
> >>> outside LOGGING state, get device memory returns dirty data since last get
> >>> operation.
> >>> 
> >>> Usually, device memory is very big, qemu needs to chunk it into several
> >>> pieces each with size of device memory region.
> >>> 
> >>> Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
> >>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>> ---
> >>> hw/vfio/migration.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++--
> >>> hw/vfio/pci.h       |   1 +
> >>> 2 files changed, 231 insertions(+), 5 deletions(-)
> >>> 
> >>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> >>> index 16d6395..f1e9309 100644
> >>> --- a/hw/vfio/migration.c
> >>> +++ b/hw/vfio/migration.c
> >>> @@ -203,6 +203,201 @@ static int vfio_load_data_device_config(VFIOPCIDevice *vdev,
> >>>    return 0;
> >>> }
> >>> 
> >>> +static int vfio_get_device_memory_size(VFIOPCIDevice *vdev)
> >>> +{
> >>> +    VFIODevice *vbasedev = &vdev->vbasedev;
> >>> +    VFIORegion *region_ctl =
> >>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> >>> +    uint64_t len;
> >>> +    int sz;
> >>> +
> >>> +    sz = sizeof(len);
> >>> +    if (pread(vbasedev->fd, &len, sz,
> >>> +                region_ctl->fd_offset +
> >>> +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> >>> +            != sz) {
> >>> +        error_report("vfio: Failed to get length of device memory”);
> >> 
> >> s/length/size/ ? (to be consistent with function name)
> > 
> > ok. thanks
> >>> +        return -1;
> >>> +    }
> >>> +    vdev->migration->devmem_size = len;
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +static int vfio_set_device_memory_size(VFIOPCIDevice *vdev, uint64_t size)
> >>> +{
> >>> +    VFIODevice *vbasedev = &vdev->vbasedev;
> >>> +    VFIORegion *region_ctl =
> >>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> >>> +    int sz;
> >>> +
> >>> +    sz = sizeof(size);
> >>> +    if (pwrite(vbasedev->fd, &size, sz,
> >>> +                region_ctl->fd_offset +
> >>> +                offsetof(struct vfio_device_state_ctl, device_memory.size))
> >>> +            != sz) {
> >>> +        error_report("vfio: Failed to set length of device comemory”);
> >> 
> >> What is comemory? Typo?
> > 
> > Right, typo. should be "memory" :)
> >> 
> >> Same comment about length vs size
> >> 
> > got it. thanks
> > 
> >>> +        return -1;
> >>> +    }
> >>> +    vdev->migration->devmem_size = size;
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +static
> >>> +int vfio_save_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> >>> +                                    uint64_t pos, uint64_t len)
> >>> +{
> >>> +    VFIODevice *vbasedev = &vdev->vbasedev;
> >>> +    VFIORegion *region_ctl =
> >>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> >>> +    VFIORegion *region_devmem =
> >>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> >>> +    void *dest;
> >>> +    uint32_t sz;
> >>> +    uint8_t *buf = NULL;
> >>> +    uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER;
> >>> +
> >>> +    if (len > region_devmem->size) {
> >> 
> >> Is it intentional that there is no error_report here?
> >> 
> > an error_report here may be better.
> >>> +        return -1;
> >>> +    }
> >>> +
> >>> +    sz = sizeof(pos);
> >>> +    if (pwrite(vbasedev->fd, &pos, sz,
> >>> +                region_ctl->fd_offset +
> >>> +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> >>> +            != sz) {
> >>> +        error_report("vfio: Failed to set save buffer pos");
> >>> +        return -1;
> >>> +    }
> >>> +    sz = sizeof(action);
> >>> +    if (pwrite(vbasedev->fd, &action, sz,
> >>> +                region_ctl->fd_offset +
> >>> +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> >>> +            != sz) {
> >>> +        error_report("vfio: Failed to set save buffer action");
> >>> +        return -1;
> >>> +    }
> >>> +
> >>> +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> >>> +        buf = g_malloc(len);
> >>> +        if (buf == NULL) {
> >>> +            error_report("vfio: Failed to allocate memory for migrate”);
> >> s/migrate/migration/ ?
> > 
> > yes, thanks
> >>> +            return -1;
> >>> +        }
> >>> +        if (pread(vbasedev->fd, buf, len, region_devmem->fd_offset) != len) {
> >>> +            error_report("vfio: error load device memory buffer”);
> >> s/load/loading/ ?
> > error to load? :)
> 
> I’d check with a native speaker, but I believe it’s “error loading”.
> 
> To me (to be checked), the two sentences don’t have the same meaning:
> 
> “It is an error to load device memory buffer” -> “You are not allowed to do that”
> “I had an error loading device memory buffer” -> “I tried, but it failed"
>
haha, ok, I'll change it to loading, thanks :)
> > 
> >>> +            return -1;
> >>> +        }
> >>> +        qemu_put_be64(f, len);
> >>> +        qemu_put_be64(f, pos);
> >>> +        qemu_put_buffer(f, buf, len);
> >>> +        g_free(buf);
> >>> +    } else {
> >>> +        dest = region_devmem->mmaps[0].mmap;
> >>> +        qemu_put_be64(f, len);
> >>> +        qemu_put_be64(f, pos);
> >>> +        qemu_put_buffer(f, dest, len);
> >>> +    }
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +static int vfio_save_data_device_memory(VFIOPCIDevice *vdev, QEMUFile *f)
> >>> +{
> >>> +    VFIORegion *region_devmem =
> >>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> >>> +    uint64_t total_len = vdev->migration->devmem_size;
> >>> +    uint64_t pos = 0;
> >>> +
> >>> +    qemu_put_be64(f, total_len);
> >>> +    while (pos < total_len) {
> >>> +        uint64_t len = region_devmem->size;
> >>> +
> >>> +        if (pos + len >= total_len) {
> >>> +            len = total_len - pos;
> >>> +        }
> >>> +        if (vfio_save_data_device_memory_chunk(vdev, f, pos, len)) {
> >>> +            return -1;
> >>> +        }
> >> 
> >> I don’t see where pos is incremented in this loop
> >> 
> > yes, missing one line "pos += len;"
> > Currently, code is not verified in hardware with device memory cap on.
> > Thanks:)
> >>> +    }
> >>> +
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +static
> >>> +int vfio_load_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
> >>> +                                uint64_t pos, uint64_t len)
> >>> +{
> >>> +    VFIODevice *vbasedev = &vdev->vbasedev;
> >>> +    VFIORegion *region_ctl =
> >>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
> >>> +    VFIORegion *region_devmem =
> >>> +        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
> >>> +
> >>> +    void *dest;
> >>> +    uint32_t sz;
> >>> +    uint8_t *buf = NULL;
> >>> +    uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER;
> >>> +
> >>> +    if (len > region_devmem->size) {
> >> 
> >> error_report?
> > 
> > seems better to add error_report.
> >>> +        return -1;
> >>> +    }
> >>> +
> >>> +    sz = sizeof(pos);
> >>> +    if (pwrite(vbasedev->fd, &pos, sz,
> >>> +                region_ctl->fd_offset +
> >>> +                offsetof(struct vfio_device_state_ctl, device_memory.pos))
> >>> +            != sz) {
> >>> +        error_report("vfio: Failed to set device memory buffer pos");
> >>> +        return -1;
> >>> +    }
> >>> +    if (!vfio_device_state_region_mmaped(region_devmem)) {
> >>> +        buf = g_malloc(len);
> >>> +        if (buf == NULL) {
> >>> +            error_report("vfio: Failed to allocate memory for migrate");
> >>> +            return -1;
> >>> +        }
> >>> +        qemu_get_buffer(f, buf, len);
> >>> +        if (pwrite(vbasedev->fd, buf, len,
> >>> +                    region_devmem->fd_offset) != len) {
> >>> +            error_report("vfio: Failed to load devie memory buffer");
> >>> +            return -1;
> >>> +        }
> >>> +        g_free(buf);
> >>> +    } else {
> >>> +        dest = region_devmem->mmaps[0].mmap;
> >>> +        qemu_get_buffer(f, dest, len);
> >>> +    }
> >>> +
> >>> +    sz = sizeof(action);
> >>> +    if (pwrite(vbasedev->fd, &action, sz,
> >>> +                region_ctl->fd_offset +
> >>> +                offsetof(struct vfio_device_state_ctl, device_memory.action))
> >>> +            != sz) {
> >>> +        error_report("vfio: Failed to set load device memory buffer action");
> >>> +        return -1;
> >>> +    }
> >>> +
> >>> +    return 0;
> >>> +
> >>> +}
> >>> +
> >>> +static int vfio_load_data_device_memory(VFIOPCIDevice *vdev,
> >>> +                        QEMUFile *f, uint64_t total_len)
> >>> +{
> >>> +    uint64_t pos = 0, len = 0;
> >>> +
> >>> +    vfio_set_device_memory_size(vdev, total_len);
> >>> +
> >>> +    while (pos + len < total_len) {
> >>> +        len = qemu_get_be64(f);
> >>> +        pos = qemu_get_be64(f);
> >> 
> >> Nit: load reads len/pos in the loop, whereas save does it in the
> >> inner function (vfio_save_data_device_memory_chunk)
> > right, load has to read len/pos in the loop.
> >> 
> >>> +
> >>> +        vfio_load_data_device_memory_chunk(vdev, f, pos, len);
> >>> +    }
> >>> +
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +
> >>> static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev,
> >>>        uint64_t start_addr, uint64_t page_nr)
> >>> {
> >>> @@ -377,6 +572,10 @@ static void vfio_save_live_pending(QEMUFile *f, void *opaque,
> >>>        return;
> >>>    }
> >>> 
> >>> +    /* get dirty data size of device memory */
> >>> +    vfio_get_device_memory_size(vdev);
> >>> +
> >>> +    *res_precopy_only += vdev->migration->devmem_size;
> >>>    return;
> >>> }
> >>> 
> >>> @@ -388,7 +587,9 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
> >>>        return 0;
> >>>    }
> >>> 
> >>> -    return 0;
> >>> +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> >>> +    /* get dirty data of device memory */
> >>> +    return vfio_save_data_device_memory(vdev, f);
> >>> }
> >>> 
> >>> static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f)
> >>> @@ -458,6 +659,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
> >>>            len = qemu_get_be64(f);
> >>>            vfio_load_data_device_config(vdev, f, len);
> >>>            break;
> >>> +        case VFIO_SAVE_FLAG_DEVMEMORY:
> >>> +            len = qemu_get_be64(f);
> >>> +            vfio_load_data_device_memory(vdev, f, len);
> >>> +            break;
> >>>        default:
> >>>            ret = -EINVAL;
> >>>        }
> >>> @@ -503,6 +708,13 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> >>>    VFIOPCIDevice *vdev = opaque;
> >>>    int rc = 0;
> >>> 
> >>> +    if (vfio_device_data_cap_device_memory(vdev)) {
> >>> +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY | VFIO_SAVE_FLAG_CONTINUE);
> >>> +        /* get dirty data of device memory */
> >>> +        vfio_get_device_memory_size(vdev);
> >>> +        rc = vfio_save_data_device_memory(vdev, f);
> >>> +    }
> >>> +
> >>>    qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE);
> >>>    vfio_pci_save_config(vdev, f);
> >>> 
> >>> @@ -515,12 +727,22 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> >>> 
> >>> static int vfio_save_setup(QEMUFile *f, void *opaque)
> >>> {
> >>> +    int rc = 0;
> >>>    VFIOPCIDevice *vdev = opaque;
> >>> -    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> >>> +
> >>> +    if (vfio_device_data_cap_device_memory(vdev)) {
> >>> +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP | VFIO_SAVE_FLAG_CONTINUE);
> >>> +        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
> >>> +        /* get whole snapshot of device memory */
> >>> +        vfio_get_device_memory_size(vdev);
> >>> +        rc = vfio_save_data_device_memory(vdev, f);
> >>> +    } else {
> >>> +        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> >>> +    }
> >>> 
> >>>    vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING |
> >>>            VFIO_DEVICE_STATE_LOGGING);
> >>> -    return 0;
> >>> +    return rc;
> >>> }
> >>> 
> >>> static int vfio_load_setup(QEMUFile *f, void *opaque)
> >>> @@ -576,8 +798,11 @@ int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp)
> >>>        goto error;
> >>>    }
> >>> 
> >>> -    if (vfio_device_data_cap_device_memory(vdev)) {
> >>> -        error_report("No suppport of data cap device memory Yet");
> >>> +    if (vfio_device_data_cap_device_memory(vdev) &&
> >>> +            vfio_device_state_region_setup(vdev,
> >>> +              &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY],
> >>> +              VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_MEMORY,
> >>> +              "device-state-data-device-memory")) {
> >>>        goto error;
> >>>    }
> >>> 
> >>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> >>> index 4b7b1bb..a2cc64b 100644
> >>> --- a/hw/vfio/pci.h
> >>> +++ b/hw/vfio/pci.h
> >>> @@ -69,6 +69,7 @@ typedef struct VFIOMigration {
> >>>    uint32_t data_caps;
> >>>    uint32_t device_state;
> >>>    uint64_t devconfig_size;
> >>> +    uint64_t devmem_size;
> >>>    VMChangeStateEntry *vm_state;
> >>> } VFIOMigration;
> >>> 
> >>> -- 
> >>> 2.7.4
> >>> 
> >>> _______________________________________________
> >>> intel-gvt-dev mailing list
> >>> intel-gvt-dev@lists.freedesktop.org
> >>> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev
> >> 
> >> _______________________________________________
> >> intel-gvt-dev mailing list
> >> intel-gvt-dev@lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev
>
diff mbox series

Patch

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 16d6395..f1e9309 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -203,6 +203,201 @@  static int vfio_load_data_device_config(VFIOPCIDevice *vdev,
     return 0;
 }
 
+static int vfio_get_device_memory_size(VFIOPCIDevice *vdev)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    VFIORegion *region_ctl =
+        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
+    uint64_t len;
+    int sz;
+
+    sz = sizeof(len);
+    if (pread(vbasedev->fd, &len, sz,
+                region_ctl->fd_offset +
+                offsetof(struct vfio_device_state_ctl, device_memory.size))
+            != sz) {
+        error_report("vfio: Failed to get length of device memory");
+        return -1;
+    }
+    vdev->migration->devmem_size = len;
+    return 0;
+}
+
+static int vfio_set_device_memory_size(VFIOPCIDevice *vdev, uint64_t size)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    VFIORegion *region_ctl =
+        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
+    int sz;
+
+    sz = sizeof(size);
+    if (pwrite(vbasedev->fd, &size, sz,
+                region_ctl->fd_offset +
+                offsetof(struct vfio_device_state_ctl, device_memory.size))
+            != sz) {
+        error_report("vfio: Failed to set length of device comemory");
+        return -1;
+    }
+    vdev->migration->devmem_size = size;
+    return 0;
+}
+
+static
+int vfio_save_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
+                                    uint64_t pos, uint64_t len)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    VFIORegion *region_ctl =
+        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
+    VFIORegion *region_devmem =
+        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
+    void *dest;
+    uint32_t sz;
+    uint8_t *buf = NULL;
+    uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER;
+
+    if (len > region_devmem->size) {
+        return -1;
+    }
+
+    sz = sizeof(pos);
+    if (pwrite(vbasedev->fd, &pos, sz,
+                region_ctl->fd_offset +
+                offsetof(struct vfio_device_state_ctl, device_memory.pos))
+            != sz) {
+        error_report("vfio: Failed to set save buffer pos");
+        return -1;
+    }
+    sz = sizeof(action);
+    if (pwrite(vbasedev->fd, &action, sz,
+                region_ctl->fd_offset +
+                offsetof(struct vfio_device_state_ctl, device_memory.action))
+            != sz) {
+        error_report("vfio: Failed to set save buffer action");
+        return -1;
+    }
+
+    if (!vfio_device_state_region_mmaped(region_devmem)) {
+        buf = g_malloc(len);
+        if (buf == NULL) {
+            error_report("vfio: Failed to allocate memory for migrate");
+            return -1;
+        }
+        if (pread(vbasedev->fd, buf, len, region_devmem->fd_offset) != len) {
+            error_report("vfio: error load device memory buffer");
+            return -1;
+        }
+        qemu_put_be64(f, len);
+        qemu_put_be64(f, pos);
+        qemu_put_buffer(f, buf, len);
+        g_free(buf);
+    } else {
+        dest = region_devmem->mmaps[0].mmap;
+        qemu_put_be64(f, len);
+        qemu_put_be64(f, pos);
+        qemu_put_buffer(f, dest, len);
+    }
+    return 0;
+}
+
+static int vfio_save_data_device_memory(VFIOPCIDevice *vdev, QEMUFile *f)
+{
+    VFIORegion *region_devmem =
+        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
+    uint64_t total_len = vdev->migration->devmem_size;
+    uint64_t pos = 0;
+
+    qemu_put_be64(f, total_len);
+    while (pos < total_len) {
+        uint64_t len = region_devmem->size;
+
+        if (pos + len >= total_len) {
+            len = total_len - pos;
+        }
+        if (vfio_save_data_device_memory_chunk(vdev, f, pos, len)) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static
+int vfio_load_data_device_memory_chunk(VFIOPCIDevice *vdev, QEMUFile *f,
+                                uint64_t pos, uint64_t len)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    VFIORegion *region_ctl =
+        &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL];
+    VFIORegion *region_devmem =
+        &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY];
+
+    void *dest;
+    uint32_t sz;
+    uint8_t *buf = NULL;
+    uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER;
+
+    if (len > region_devmem->size) {
+        return -1;
+    }
+
+    sz = sizeof(pos);
+    if (pwrite(vbasedev->fd, &pos, sz,
+                region_ctl->fd_offset +
+                offsetof(struct vfio_device_state_ctl, device_memory.pos))
+            != sz) {
+        error_report("vfio: Failed to set device memory buffer pos");
+        return -1;
+    }
+    if (!vfio_device_state_region_mmaped(region_devmem)) {
+        buf = g_malloc(len);
+        if (buf == NULL) {
+            error_report("vfio: Failed to allocate memory for migrate");
+            return -1;
+        }
+        qemu_get_buffer(f, buf, len);
+        if (pwrite(vbasedev->fd, buf, len,
+                    region_devmem->fd_offset) != len) {
+            error_report("vfio: Failed to load devie memory buffer");
+            return -1;
+        }
+        g_free(buf);
+    } else {
+        dest = region_devmem->mmaps[0].mmap;
+        qemu_get_buffer(f, dest, len);
+    }
+
+    sz = sizeof(action);
+    if (pwrite(vbasedev->fd, &action, sz,
+                region_ctl->fd_offset +
+                offsetof(struct vfio_device_state_ctl, device_memory.action))
+            != sz) {
+        error_report("vfio: Failed to set load device memory buffer action");
+        return -1;
+    }
+
+    return 0;
+
+}
+
+static int vfio_load_data_device_memory(VFIOPCIDevice *vdev,
+                        QEMUFile *f, uint64_t total_len)
+{
+    uint64_t pos = 0, len = 0;
+
+    vfio_set_device_memory_size(vdev, total_len);
+
+    while (pos + len < total_len) {
+        len = qemu_get_be64(f);
+        pos = qemu_get_be64(f);
+
+        vfio_load_data_device_memory_chunk(vdev, f, pos, len);
+    }
+
+    return 0;
+}
+
+
 static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev,
         uint64_t start_addr, uint64_t page_nr)
 {
@@ -377,6 +572,10 @@  static void vfio_save_live_pending(QEMUFile *f, void *opaque,
         return;
     }
 
+    /* get dirty data size of device memory */
+    vfio_get_device_memory_size(vdev);
+
+    *res_precopy_only += vdev->migration->devmem_size;
     return;
 }
 
@@ -388,7 +587,9 @@  static int vfio_save_iterate(QEMUFile *f, void *opaque)
         return 0;
     }
 
-    return 0;
+    qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
+    /* get dirty data of device memory */
+    return vfio_save_data_device_memory(vdev, f);
 }
 
 static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f)
@@ -458,6 +659,10 @@  static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
             len = qemu_get_be64(f);
             vfio_load_data_device_config(vdev, f, len);
             break;
+        case VFIO_SAVE_FLAG_DEVMEMORY:
+            len = qemu_get_be64(f);
+            vfio_load_data_device_memory(vdev, f, len);
+            break;
         default:
             ret = -EINVAL;
         }
@@ -503,6 +708,13 @@  static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
     VFIOPCIDevice *vdev = opaque;
     int rc = 0;
 
+    if (vfio_device_data_cap_device_memory(vdev)) {
+        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY | VFIO_SAVE_FLAG_CONTINUE);
+        /* get dirty data of device memory */
+        vfio_get_device_memory_size(vdev);
+        rc = vfio_save_data_device_memory(vdev, f);
+    }
+
     qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE);
     vfio_pci_save_config(vdev, f);
 
@@ -515,12 +727,22 @@  static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
 
 static int vfio_save_setup(QEMUFile *f, void *opaque)
 {
+    int rc = 0;
     VFIOPCIDevice *vdev = opaque;
-    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
+
+    if (vfio_device_data_cap_device_memory(vdev)) {
+        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP | VFIO_SAVE_FLAG_CONTINUE);
+        qemu_put_byte(f, VFIO_SAVE_FLAG_DEVMEMORY);
+        /* get whole snapshot of device memory */
+        vfio_get_device_memory_size(vdev);
+        rc = vfio_save_data_device_memory(vdev, f);
+    } else {
+        qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
+    }
 
     vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING |
             VFIO_DEVICE_STATE_LOGGING);
-    return 0;
+    return rc;
 }
 
 static int vfio_load_setup(QEMUFile *f, void *opaque)
@@ -576,8 +798,11 @@  int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp)
         goto error;
     }
 
-    if (vfio_device_data_cap_device_memory(vdev)) {
-        error_report("No suppport of data cap device memory Yet");
+    if (vfio_device_data_cap_device_memory(vdev) &&
+            vfio_device_state_region_setup(vdev,
+              &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY],
+              VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_MEMORY,
+              "device-state-data-device-memory")) {
         goto error;
     }
 
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 4b7b1bb..a2cc64b 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -69,6 +69,7 @@  typedef struct VFIOMigration {
     uint32_t data_caps;
     uint32_t device_state;
     uint64_t devconfig_size;
+    uint64_t devmem_size;
     VMChangeStateEntry *vm_state;
 } VFIOMigration;