diff mbox series

[v5,17/18] vfio-user: register handlers to facilitate migration

Message ID 0ec006e5c309166cc1920871688508c1f6270ac3.1642626515.git.jag.raman@oracle.com
State New
Headers show
Series vfio-user server in QEMU | expand

Commit Message

Jag Raman Jan. 19, 2022, 9:42 p.m. UTC
Store and load the device's state during migration. Use libvfio-user's
handlers for this purpose.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 include/migration/vmstate.h |   2 +
 migration/savevm.h          |   2 +
 hw/remote/vfio-user-obj.c   | 323 ++++++++++++++++++++++++++++++++++++
 migration/savevm.c          |  73 ++++++++
 migration/vmstate.c         |  19 +++
 5 files changed, 419 insertions(+)

Comments

Stefan Hajnoczi Jan. 25, 2022, 3:48 p.m. UTC | #1
On Wed, Jan 19, 2022 at 04:42:06PM -0500, Jagannathan Raman wrote:
> +     * The client subsequetly asks the remote server for any data that

subsequently

> +static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
> +    static int migrated_devs;
> +    Error *local_err = NULL;
> +    int ret;
> +
> +    /**
> +     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
> +     * VMSD data from source is not available at RESUME state.
> +     * Working on a fix for this.
> +     */
> +    if (!o->vfu_mig_file) {
> +        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
> +    }
> +
> +    ret = qemu_remote_loadvm(o->vfu_mig_file);
> +    if (ret) {
> +        VFU_OBJECT_ERROR(o, "vfu: failed to restore device state");
> +        return;
> +    }
> +
> +    qemu_file_shutdown(o->vfu_mig_file);
> +    o->vfu_mig_file = NULL;
> +
> +    /* VFU_MIGR_STATE_RUNNING begins here */
> +    if (++migrated_devs == k->nr_devs) {

When is this counter reset so migration can be tried again if it
fails/cancels?

> +static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
> +                                 uint64_t size, uint64_t offset)
> +{
> +    VfuObject *o = vfu_get_private(vfu_ctx);
> +
> +    if (offset > o->vfu_mig_buf_size) {
> +        return -1;
> +    }
> +
> +    if ((offset + size) > o->vfu_mig_buf_size) {
> +        warn_report("vfu: buffer overflow - check pending_bytes");
> +        size = o->vfu_mig_buf_size - offset;
> +    }
> +
> +    memcpy(buf, (o->vfu_mig_buf + offset), size);
> +
> +    o->vfu_mig_buf_pending -= size;

This assumes that the caller increments offset by size each time. If
that assumption is okay, then we can just trust offset and don't need to
do arithmetic on vfu_mig_buf_pending. If that assumption is not correct,
then the code needs to be extended to safely update vfu_mig_buf_pending
when offset jumps around arbitrarily between calls.

> +uint64_t vmstate_vmsd_size(PCIDevice *pci_dev)
> +{
> +    DeviceClass *dc = DEVICE_GET_CLASS(DEVICE(pci_dev));
> +    const VMStateField *field = NULL;
> +    uint64_t size = 0;
> +
> +    if (!dc->vmsd) {
> +        return 0;
> +    }
> +
> +    field = dc->vmsd->fields;
> +    while (field && field->name) {
> +        size += vmstate_size(pci_dev, field);
> +        field++;
> +    }
> +
> +    return size;
> +}

This function looks incorrect because it ignores subsections as well as
runtime behavior during save(). Although VMStateDescription is partially
declarative, there is still a bunch of imperative code that can write to
the QEMUFile at save() time so there's no way of knowing the size ahead
of time.

I asked this in a previous revision of this series but I'm not sure if
it was answered: is it really necessary to know the size of the vmstate?
I thought the VFIO migration interface is designed to support
streaming reads/writes. We could choose a fixed size like 64KB and
stream the vmstate in 64KB chunks.
Jag Raman Jan. 27, 2022, 5:04 p.m. UTC | #2
> On Jan 25, 2022, at 10:48 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Wed, Jan 19, 2022 at 04:42:06PM -0500, Jagannathan Raman wrote:
>> +     * The client subsequetly asks the remote server for any data that
> 
> subsequently
> 
>> +static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
>> +{
>> +    VfuObject *o = vfu_get_private(vfu_ctx);
>> +    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
>> +    static int migrated_devs;
>> +    Error *local_err = NULL;
>> +    int ret;
>> +
>> +    /**
>> +     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
>> +     * VMSD data from source is not available at RESUME state.
>> +     * Working on a fix for this.
>> +     */
>> +    if (!o->vfu_mig_file) {
>> +        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
>> +    }
>> +
>> +    ret = qemu_remote_loadvm(o->vfu_mig_file);
>> +    if (ret) {
>> +        VFU_OBJECT_ERROR(o, "vfu: failed to restore device state");
>> +        return;
>> +    }
>> +
>> +    qemu_file_shutdown(o->vfu_mig_file);
>> +    o->vfu_mig_file = NULL;
>> +
>> +    /* VFU_MIGR_STATE_RUNNING begins here */
>> +    if (++migrated_devs == k->nr_devs) {
> 
> When is this counter reset so migration can be tried again if it
> fails/cancels?

Detecting cancellation is a pending item. We will address it in the
next rev. Will check with you if we get stuck during the process
of implementing it.

> 
>> +static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
>> +                                 uint64_t size, uint64_t offset)
>> +{
>> +    VfuObject *o = vfu_get_private(vfu_ctx);
>> +
>> +    if (offset > o->vfu_mig_buf_size) {
>> +        return -1;
>> +    }
>> +
>> +    if ((offset + size) > o->vfu_mig_buf_size) {
>> +        warn_report("vfu: buffer overflow - check pending_bytes");
>> +        size = o->vfu_mig_buf_size - offset;
>> +    }
>> +
>> +    memcpy(buf, (o->vfu_mig_buf + offset), size);
>> +
>> +    o->vfu_mig_buf_pending -= size;
> 
> This assumes that the caller increments offset by size each time. If
> that assumption is okay, then we can just trust offset and don't need to
> do arithmetic on vfu_mig_buf_pending. If that assumption is not correct,
> then the code needs to be extended to safely update vfu_mig_buf_pending
> when offset jumps around arbitrarily between calls.

Going by the definition of vfu_migration_callbacks_t in the library, I assumed
that read_data advances the offset by size bytes.

Will add a comment to explain that.

> 
>> +uint64_t vmstate_vmsd_size(PCIDevice *pci_dev)
>> +{
>> +    DeviceClass *dc = DEVICE_GET_CLASS(DEVICE(pci_dev));
>> +    const VMStateField *field = NULL;
>> +    uint64_t size = 0;
>> +
>> +    if (!dc->vmsd) {
>> +        return 0;
>> +    }
>> +
>> +    field = dc->vmsd->fields;
>> +    while (field && field->name) {
>> +        size += vmstate_size(pci_dev, field);
>> +        field++;
>> +    }
>> +
>> +    return size;
>> +}
> 
> This function looks incorrect because it ignores subsections as well as
> runtime behavior during save(). Although VMStateDescription is partially
> declarative, there is still a bunch of imperative code that can write to
> the QEMUFile at save() time so there's no way of knowing the size ahead
> of time.

I see your point, it would be a problem for any field which has the
(VMS_BUFFER | VMS_ALLOC) flags set.

> 
> I asked this in a previous revision of this series but I'm not sure if
> it was answered: is it really necessary to know the size of the vmstate?
> I thought the VFIO migration interface is designed to support
> streaming reads/writes. We could choose a fixed size like 64KB and
> stream the vmstate in 64KB chunks.

The library exposes the migration data to the client as a device BAR with
fixed size - the size of which is fixed at boot time, even when using
vfu_migration_callbacks_t callbacks.

I don’t believe the library supports streaming vmstate/migration-data - see
the following comment in migration_region_access() defined in the library:

* Does this mean that partial reads are not allowed?

Thanos or John,

    Could you please clarify this?

Stefan,
    We attempted to answer the migration cancellation and vmstate size
    questions previously also, in the following email:

https://lore.kernel.org/all/F48606B1-15A4-4DD2-9D71-2FCAFC0E671F@oracle.com/

Thank you very much!
--
Jag
Stefan Hajnoczi Jan. 28, 2022, 8:29 a.m. UTC | #3
On Thu, Jan 27, 2022 at 05:04:26PM +0000, Jag Raman wrote:
> 
> 
> > On Jan 25, 2022, at 10:48 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Wed, Jan 19, 2022 at 04:42:06PM -0500, Jagannathan Raman wrote:
> >> +     * The client subsequetly asks the remote server for any data that
> > 
> > subsequently
> > 
> >> +static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
> >> +{
> >> +    VfuObject *o = vfu_get_private(vfu_ctx);
> >> +    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
> >> +    static int migrated_devs;
> >> +    Error *local_err = NULL;
> >> +    int ret;
> >> +
> >> +    /**
> >> +     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
> >> +     * VMSD data from source is not available at RESUME state.
> >> +     * Working on a fix for this.
> >> +     */
> >> +    if (!o->vfu_mig_file) {
> >> +        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
> >> +    }
> >> +
> >> +    ret = qemu_remote_loadvm(o->vfu_mig_file);
> >> +    if (ret) {
> >> +        VFU_OBJECT_ERROR(o, "vfu: failed to restore device state");
> >> +        return;
> >> +    }
> >> +
> >> +    qemu_file_shutdown(o->vfu_mig_file);
> >> +    o->vfu_mig_file = NULL;
> >> +
> >> +    /* VFU_MIGR_STATE_RUNNING begins here */
> >> +    if (++migrated_devs == k->nr_devs) {
> > 
> > When is this counter reset so migration can be tried again if it
> > fails/cancels?
> 
> Detecting cancellation is a pending item. We will address it in the
> next rev. Will check with you if  we get stuck during the process
> of implementing it.
> 
> > 
> >> +static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
> >> +                                 uint64_t size, uint64_t offset)
> >> +{
> >> +    VfuObject *o = vfu_get_private(vfu_ctx);
> >> +
> >> +    if (offset > o->vfu_mig_buf_size) {
> >> +        return -1;
> >> +    }
> >> +
> >> +    if ((offset + size) > o->vfu_mig_buf_size) {
> >> +        warn_report("vfu: buffer overflow - check pending_bytes");
> >> +        size = o->vfu_mig_buf_size - offset;
> >> +    }
> >> +
> >> +    memcpy(buf, (o->vfu_mig_buf + offset), size);
> >> +
> >> +    o->vfu_mig_buf_pending -= size;
> > 
> > This assumes that the caller increments offset by size each time. If
> > that assumption is okay, then we can just trust offset and don't need to
> > do arithmetic on vfu_mig_buf_pending. If that assumption is not correct,
> > then the code needs to be extended to safely update vfu_mig_buf_pending
> > when offset jumps around arbitrarily between calls.
> 
> Going by the definition of vfu_migration_callbacks_t in the library, I assumed
> that read_data advances the offset by size bytes.
> 
> Will add a comment a comment to explain that.
> 
> > 
> >> +uint64_t vmstate_vmsd_size(PCIDevice *pci_dev)
> >> +{
> >> +    DeviceClass *dc = DEVICE_GET_CLASS(DEVICE(pci_dev));
> >> +    const VMStateField *field = NULL;
> >> +    uint64_t size = 0;
> >> +
> >> +    if (!dc->vmsd) {
> >> +        return 0;
> >> +    }
> >> +
> >> +    field = dc->vmsd->fields;
> >> +    while (field && field->name) {
> >> +        size += vmstate_size(pci_dev, field);
> >> +        field++;
> >> +    }
> >> +
> >> +    return size;
> >> +}
> > 
> > This function looks incorrect because it ignores subsections as well as
> > runtime behavior during save(). Although VMStateDescription is partially
> > declarative, there is still a bunch of imperative code that can write to
> > the QEMUFile at save() time so there's no way of knowing the size ahead
> > of time.
> 
> I see your point, it would be a problem for any field which has the
> (VMS_BUFFER | VMS_ALLOC) flags set.
> 
> > 
> > I asked this in a previous revision of this series but I'm not sure if
> > it was answered: is it really necessary to know the size of the vmstate?
> > I thought the VFIO migration interface is designed to support
> > streaming reads/writes. We could choose a fixed size like 64KB and
> > stream the vmstate in 64KB chunks.
> 
> The library exposes the migration data to the client as a device BAR with
> fixed size - the size of which is fixed at boot time, even when using
> vfu_migration_callbacks_t callbacks.
> 
> I don’t believe the library supports streaming vmstate/migration-data - see
> the following comment in migration_region_access() defined in the library:
> 
> * Does this mean that partial reads are not allowed?
> 
> Thanos or John,
> 
>     Could you please clarify this?
> 
> Stefan,
>     We attempted to answer the migration cancellation and vmstate size
>     questions previously also, in the following email:
> 
> https://lore.kernel.org/all/F48606B1-15A4-4DD2-9D71-2FCAFC0E671F@oracle.com/

>  libvfio-user has the vfu_migration_callbacks_t interface that allows the
>  device to save/load more data regardless of the size of the migration
>  region. I don't see the issue here since the region doesn't need to be
>  sized to fit the savevm data?

The answer didn't make sense to me:

"In both scenarios at the server end - whether using the migration BAR or
using callbacks, the migration data is transported to the other end using
the BAR. As such we need to specify the BAR’s size during initialization.

In the case of the callbacks, the library translates the BAR access to callbacks."

The BAR and the migration region within it need a size but my
understanding is that VFIO migration is designed to stream the device
state, allowing it to be broken up into multiple reads/writes with
knowing the device state's size upfront. Here is the description from
<linux/vfio.h>:

  * The sequence to be followed while in pre-copy state and stop-and-copy state
  * is as follows:
  * a. Read pending_bytes, indicating the start of a new iteration to get device
  *    data. Repeated read on pending_bytes at this stage should have no side
  *    effects.
  *    If pending_bytes == 0, the user application should not iterate to get data
  *    for that device.
  *    If pending_bytes > 0, perform the following steps.
  * b. Read data_offset, indicating that the vendor driver should make data
  *    available through the data section. The vendor driver should return this
  *    read operation only after data is available from (region + data_offset)
  *    to (region + data_offset + data_size).
  * c. Read data_size, which is the amount of data in bytes available through
  *    the migration region.
  *    Read on data_offset and data_size should return the offset and size of
  *    the current buffer if the user application reads data_offset and
  *    data_size more than once here.
  * d. Read data_size bytes of data from (region + data_offset) from the
  *    migration region.
  * e. Process the data.
  * f. Read pending_bytes, which indicates that the data from the previous
  *    iteration has been read. If pending_bytes > 0, go to step b.
  *
  * The user application can transition from the _SAVING|_RUNNING
  * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
  * number of pending bytes. The user application should iterate in _SAVING
  * (stop-and-copy) until pending_bytes is 0.

This means you can report pending_bytes > 0 until the entire vmstate has
been read and can pick a fixed chunk size like 64KB for the migration
region. There's no need to size the migration region to fit the entire
vmstate.

Stefan
Thanos Makatos Jan. 28, 2022, 2:49 p.m. UTC | #4
> -----Original Message-----
> From: Stefan Hajnoczi <stefanha@redhat.com>
> Sent: 28 January 2022 08:29
> To: Jag Raman <jag.raman@oracle.com>
> Cc: John Levon <john.levon@nutanix.com>; Thanos Makatos
> <thanos.makatos@nutanix.com>; qemu-devel <qemu-devel@nongnu.org>;
> Marc-André Lureau <marcandre.lureau@gmail.com>; Philippe Mathieu-Daudé
> <f4bug@amsat.org>; Paolo Bonzini <pbonzini@redhat.com>; Beraldo Leal
> <bleal@redhat.com>; Daniel P. Berrangé <berrange@redhat.com>;
> eduardo@habkost.net; Michael S. Tsirkin <mst@redhat.com>; Marcel
> Apfelbaum <marcel.apfelbaum@gmail.com>; Eric Blake <eblake@redhat.com>;
> Markus Armbruster <armbru@redhat.com>; Juan Quintela
> <quintela@redhat.com>; Dr . David Alan Gilbert <dgilbert@redhat.com>; Elena
> Ufimtseva <elena.ufimtseva@oracle.com>; John Johnson
> <john.g.johnson@oracle.com>
> Subject: Re: [PATCH v5 17/18] vfio-user: register handlers to facilitate migration
> 
> On Thu, Jan 27, 2022 at 05:04:26PM +0000, Jag Raman wrote:
> >
> >
> > > On Jan 25, 2022, at 10:48 AM, Stefan Hajnoczi <stefanha@redhat.com>
> wrote:
> > >
> > > On Wed, Jan 19, 2022 at 04:42:06PM -0500, Jagannathan Raman wrote:
> > >> +     * The client subsequetly asks the remote server for any data that
> > >
> > > subsequently
> > >
> > >> +static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
> > >> +{
> > >> +    VfuObject *o = vfu_get_private(vfu_ctx);
> > >> +    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
> > >> +    static int migrated_devs;
> > >> +    Error *local_err = NULL;
> > >> +    int ret;
> > >> +
> > >> +    /**
> > >> +     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
> > >> +     * VMSD data from source is not available at RESUME state.
> > >> +     * Working on a fix for this.
> > >> +     */
> > >> +    if (!o->vfu_mig_file) {
> > >> +        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
> > >> +    }
> > >> +
> > >> +    ret = qemu_remote_loadvm(o->vfu_mig_file);
> > >> +    if (ret) {
> > >> +        VFU_OBJECT_ERROR(o, "vfu: failed to restore device state");
> > >> +        return;
> > >> +    }
> > >> +
> > >> +    qemu_file_shutdown(o->vfu_mig_file);
> > >> +    o->vfu_mig_file = NULL;
> > >> +
> > >> +    /* VFU_MIGR_STATE_RUNNING begins here */
> > >> +    if (++migrated_devs == k->nr_devs) {
> > >
> > > When is this counter reset so migration can be tried again if it
> > > fails/cancels?
> >
> > Detecting cancellation is a pending item. We will address it in the
> > next rev. Will check with you if  we get stuck during the process
> > of implementing it.
> >
> > >
> > >> +static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
> > >> +                                 uint64_t size, uint64_t offset)
> > >> +{
> > >> +    VfuObject *o = vfu_get_private(vfu_ctx);
> > >> +
> > >> +    if (offset > o->vfu_mig_buf_size) {
> > >> +        return -1;
> > >> +    }
> > >> +
> > >> +    if ((offset + size) > o->vfu_mig_buf_size) {
> > >> +        warn_report("vfu: buffer overflow - check pending_bytes");
> > >> +        size = o->vfu_mig_buf_size - offset;
> > >> +    }
> > >> +
> > >> +    memcpy(buf, (o->vfu_mig_buf + offset), size);
> > >> +
> > >> +    o->vfu_mig_buf_pending -= size;
> > >
> > > This assumes that the caller increments offset by size each time. If
> > > that assumption is okay, then we can just trust offset and don't need to
> > > do arithmetic on vfu_mig_buf_pending. If that assumption is not correct,
> > > then the code needs to be extended to safely update vfu_mig_buf_pending
> > > when offset jumps around arbitrarily between calls.
> >
> > Going by the definition of vfu_migration_callbacks_t in the library, I assumed
> > that read_data advances the offset by size bytes.
> >
> > Will add a comment a comment to explain that.

libvfio-user does not automatically increment offset by size each time, since
the vfio-user client can re-read the migration data multiple times. In
libvfio-user API we state:

    Function that is called to read migration data. offset and size can be
    any subrange on the offset and size previously returned by prepare_data.

Reading the pending_bytes register is what marks the end of the iteration, and
this is where you need to decrement vfu_mig_buf_pending.

I'll add more unit tests to libvfio-user to validate this behavior.

> >
> > >
> > >> +uint64_t vmstate_vmsd_size(PCIDevice *pci_dev)
> > >> +{
> > >> +    DeviceClass *dc = DEVICE_GET_CLASS(DEVICE(pci_dev));
> > >> +    const VMStateField *field = NULL;
> > >> +    uint64_t size = 0;
> > >> +
> > >> +    if (!dc->vmsd) {
> > >> +        return 0;
> > >> +    }
> > >> +
> > >> +    field = dc->vmsd->fields;
> > >> +    while (field && field->name) {
> > >> +        size += vmstate_size(pci_dev, field);
> > >> +        field++;
> > >> +    }
> > >> +
> > >> +    return size;
> > >> +}
> > >
> > > This function looks incorrect because it ignores subsections as well as
> > > runtime behavior during save(). Although VMStateDescription is partially
> > > declarative, there is still a bunch of imperative code that can write to
> > > the QEMUFile at save() time so there's no way of knowing the size ahead
> > > of time.
> >
> > I see your point, it would be a problem for any field which has the
> > (VMS_BUFFER | VMS_ALLOC) flags set.
> >
> > >
> > > I asked this in a previous revision of this series but I'm not sure if
> > > it was answered: is it really necessary to know the size of the vmstate?
> > > I thought the VFIO migration interface is designed to support
> > > streaming reads/writes. We could choose a fixed size like 64KB and
> > > stream the vmstate in 64KB chunks.
> >
> > The library exposes the migration data to the client as a device BAR with
> > fixed size - the size of which is fixed at boot time, even when using
> > vfu_migration_callbacks_t callbacks.
> >
> > I don’t believe the library supports streaming vmstate/migration-data - see
> > the following comment in migration_region_access() defined in the library:
> >
> > * Does this mean that partial reads are not allowed?
> >
> > Thanos or John,
> >
> >     Could you please clarify this?

libvfio-user does support streaming of migration data, this comment is based on
the VFIO documentation:

    d. Read data_size bytes of data from (region + data_offset) from the
        migration region.

It's not clear to me whether streaming should be allowed; I'd be surprised if
it weren't.

> >
> > Stefan,
> >     We attempted to answer the migration cancellation and vmstate size
> >     questions previously also, in the following email:
> >
> > https://lore.kernel.org/all/F48606B1-15A4-4DD2-9D71-
> 2FCAFC0E671F@oracle.com/
> 
> >  libvfio-user has the vfu_migration_callbacks_t interface that allows the
> >  device to save/load more data regardless of the size of the migration
> >  region. I don't see the issue here since the region doesn't need to be
> >  sized to fit the savevm data?
> 
> The answer didn't make sense to me:
> 
> "In both scenarios at the server end - whether using the migration BAR or
> using callbacks, the migration data is transported to the other end using
> the BAR. As such we need to specify the BAR’s size during initialization.
> 
> In the case of the callbacks, the library translates the BAR access to callbacks."
> 
> The BAR and the migration region within it need a size but my
> understanding is that VFIO migration is designed to stream the device
> state, allowing it to be broken up into multiple reads/writes with
> knowing the device state's size upfront. Here is the description from
> <linux/vfio.h>:
> 
>   * The sequence to be followed while in pre-copy state and stop-and-copy state
>   * is as follows:
>   * a. Read pending_bytes, indicating the start of a new iteration to get device
>   *    data. Repeated read on pending_bytes at this stage should have no side
>   *    effects.
>   *    If pending_bytes == 0, the user application should not iterate to get data
>   *    for that device.
>   *    If pending_bytes > 0, perform the following steps.
>   * b. Read data_offset, indicating that the vendor driver should make data
>   *    available through the data section. The vendor driver should return this
>   *    read operation only after data is available from (region + data_offset)
>   *    to (region + data_offset + data_size).
>   * c. Read data_size, which is the amount of data in bytes available through
>   *    the migration region.
>   *    Read on data_offset and data_size should return the offset and size of
>   *    the current buffer if the user application reads data_offset and
>   *    data_size more than once here.
>   * d. Read data_size bytes of data from (region + data_offset) from the
>   *    migration region.
>   * e. Process the data.
>   * f. Read pending_bytes, which indicates that the data from the previous
>   *    iteration has been read. If pending_bytes > 0, go to step b.
>   *
>   * The user application can transition from the _SAVING|_RUNNING
>   * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
>   * number of pending bytes. The user application should iterate in _SAVING
>   * (stop-and-copy) until pending_bytes is 0.
> 
> This means you can report pending_bytes > 0 until the entire vmstate has
> been read and can pick a fixed chunk size like 64KB for the migration
> region. There's no need to size the migration region to fit the entire
> vmstate.
> 
> Stefan
Jag Raman Feb. 1, 2022, 3:49 a.m. UTC | #5
> On Jan 28, 2022, at 3:29 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Thu, Jan 27, 2022 at 05:04:26PM +0000, Jag Raman wrote:
>> 
>> 
>>> On Jan 25, 2022, at 10:48 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> 
>>> On Wed, Jan 19, 2022 at 04:42:06PM -0500, Jagannathan Raman wrote:
>>>> +     * The client subsequetly asks the remote server for any data that
>>> 
>>> subsequently
>>> 
>>>> +static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
>>>> +{
>>>> +    VfuObject *o = vfu_get_private(vfu_ctx);
>>>> +    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
>>>> +    static int migrated_devs;
>>>> +    Error *local_err = NULL;
>>>> +    int ret;
>>>> +
>>>> +    /**
>>>> +     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
>>>> +     * VMSD data from source is not available at RESUME state.
>>>> +     * Working on a fix for this.
>>>> +     */
>>>> +    if (!o->vfu_mig_file) {
>>>> +        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
>>>> +    }
>>>> +
>>>> +    ret = qemu_remote_loadvm(o->vfu_mig_file);
>>>> +    if (ret) {
>>>> +        VFU_OBJECT_ERROR(o, "vfu: failed to restore device state");
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    qemu_file_shutdown(o->vfu_mig_file);
>>>> +    o->vfu_mig_file = NULL;
>>>> +
>>>> +    /* VFU_MIGR_STATE_RUNNING begins here */
>>>> +    if (++migrated_devs == k->nr_devs) {
>>> 
>>> When is this counter reset so migration can be tried again if it
>>> fails/cancels?
>> 
>> Detecting cancellation is a pending item. We will address it in the
>> next rev. Will check with you if  we get stuck during the process
>> of implementing it.
>> 
>>> 
>>>> +static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
>>>> +                                 uint64_t size, uint64_t offset)
>>>> +{
>>>> +    VfuObject *o = vfu_get_private(vfu_ctx);
>>>> +
>>>> +    if (offset > o->vfu_mig_buf_size) {
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    if ((offset + size) > o->vfu_mig_buf_size) {
>>>> +        warn_report("vfu: buffer overflow - check pending_bytes");
>>>> +        size = o->vfu_mig_buf_size - offset;
>>>> +    }
>>>> +
>>>> +    memcpy(buf, (o->vfu_mig_buf + offset), size);
>>>> +
>>>> +    o->vfu_mig_buf_pending -= size;
>>> 
>>> This assumes that the caller increments offset by size each time. If
>>> that assumption is okay, then we can just trust offset and don't need to
>>> do arithmetic on vfu_mig_buf_pending. If that assumption is not correct,
>>> then the code needs to be extended to safely update vfu_mig_buf_pending
>>> when offset jumps around arbitrarily between calls.
>> 
>> Going by the definition of vfu_migration_callbacks_t in the library, I assumed
>> that read_data advances the offset by size bytes.
>> 
>> Will add a comment a comment to explain that.
>> 
>>> 
>>>> +uint64_t vmstate_vmsd_size(PCIDevice *pci_dev)
>>>> +{
>>>> +    DeviceClass *dc = DEVICE_GET_CLASS(DEVICE(pci_dev));
>>>> +    const VMStateField *field = NULL;
>>>> +    uint64_t size = 0;
>>>> +
>>>> +    if (!dc->vmsd) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    field = dc->vmsd->fields;
>>>> +    while (field && field->name) {
>>>> +        size += vmstate_size(pci_dev, field);
>>>> +        field++;
>>>> +    }
>>>> +
>>>> +    return size;
>>>> +}
>>> 
>>> This function looks incorrect because it ignores subsections as well as
>>> runtime behavior during save(). Although VMStateDescription is partially
>>> declarative, there is still a bunch of imperative code that can write to
>>> the QEMUFile at save() time so there's no way of knowing the size ahead
>>> of time.
>> 
>> I see your point, it would be a problem for any field which has the
>> (VMS_BUFFER | VMS_ALLOC) flags set.
>> 
>>> 
>>> I asked this in a previous revision of this series but I'm not sure if
>>> it was answered: is it really necessary to know the size of the vmstate?
>>> I thought the VFIO migration interface is designed to support
>>> streaming reads/writes. We could choose a fixed size like 64KB and
>>> stream the vmstate in 64KB chunks.
>> 
>> The library exposes the migration data to the client as a device BAR with
>> fixed size - the size of which is fixed at boot time, even when using
>> vfu_migration_callbacks_t callbacks.
>> 
>> I don’t believe the library supports streaming vmstate/migration-data - see
>> the following comment in migration_region_access() defined in the library:
>> 
>> * Does this mean that partial reads are not allowed?
>> 
>> Thanos or John,
>> 
>>    Could you please clarify this?
>> 
>> Stefan,
>>    We attempted to answer the migration cancellation and vmstate size
>>    questions previously also, in the following email:
>> 
>> https://lore.kernel.org/all/F48606B1-15A4-4DD2-9D71-2FCAFC0E671F@oracle.com/
> 
>> libvfio-user has the vfu_migration_callbacks_t interface that allows the
>> device to save/load more data regardless of the size of the migration
>> region. I don't see the issue here since the region doesn't need to be
>> sized to fit the savevm data?
> 
> The answer didn't make sense to me:
> 
> "In both scenarios at the server end - whether using the migration BAR or
> using callbacks, the migration data is transported to the other end using
> the BAR. As such we need to specify the BAR’s size during initialization.
> 
> In the case of the callbacks, the library translates the BAR access to callbacks."
> 
> The BAR and the migration region within it need a size but my
> understanding is that VFIO migration is designed to stream the device
> state, allowing it to be broken up into multiple reads/writes with
> knowing the device state's size upfront. Here is the description from
> <linux/vfio.h>:
> 
>  * The sequence to be followed while in pre-copy state and stop-and-copy state
>  * is as follows:
>  * a. Read pending_bytes, indicating the start of a new iteration to get device
>  *    data. Repeated read on pending_bytes at this stage should have no side
>  *    effects.
>  *    If pending_bytes == 0, the user application should not iterate to get data
>  *    for that device.
>  *    If pending_bytes > 0, perform the following steps.
>  * b. Read data_offset, indicating that the vendor driver should make data
>  *    available through the data section. The vendor driver should return this
>  *    read operation only after data is available from (region + data_offset)
>  *    to (region + data_offset + data_size).
>  * c. Read data_size, which is the amount of data in bytes available through
>  *    the migration region.
>  *    Read on data_offset and data_size should return the offset and size of
>  *    the current buffer if the user application reads data_offset and
>  *    data_size more than once here.
>  * d. Read data_size bytes of data from (region + data_offset) from the
>  *    migration region.
>  * e. Process the data.
>  * f. Read pending_bytes, which indicates that the data from the previous
>  *    iteration has been read. If pending_bytes > 0, go to step b.
>  *
>  * The user application can transition from the _SAVING|_RUNNING
>  * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
>  * number of pending bytes. The user application should iterate in _SAVING
>  * (stop-and-copy) until pending_bytes is 0.
> 
> This means you can report pending_bytes > 0 until the entire vmstate has
> been read and can pick a fixed chunk size like 64KB for the migration
> region. There's no need to size the migration region to fit the entire
> vmstate.

Thank you for the pointer to generic VFIO migration, Stefan! Makes sense.

So I understand that the VFIO migration region carves out a section to
stream/shuttle device data between the app (QEMU client in this case) and the
driver (QEMU server). This section starts at data_offset within the region and spans
data_size bytes.

We could change the server to stream the data as outlined above. Do you have a
preference for the section size? Does qemu_target_page_size() work? I just tested
and am able to stream with a fixed BAR size such as qemu_target_page_size().

Thank you!
--
Jag

> 
> Stefan
Stefan Hajnoczi Feb. 1, 2022, 9:37 a.m. UTC | #6
On Tue, Feb 01, 2022 at 03:49:40AM +0000, Jag Raman wrote:
> 
> 
> > On Jan 28, 2022, at 3:29 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Thu, Jan 27, 2022 at 05:04:26PM +0000, Jag Raman wrote:
> >> 
> >> 
> >>> On Jan 25, 2022, at 10:48 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>> 
> >>> On Wed, Jan 19, 2022 at 04:42:06PM -0500, Jagannathan Raman wrote:
> >>>> +     * The client subsequetly asks the remote server for any data that
> >>> 
> >>> subsequently
> >>> 
> >>>> +static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
> >>>> +{
> >>>> +    VfuObject *o = vfu_get_private(vfu_ctx);
> >>>> +    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
> >>>> +    static int migrated_devs;
> >>>> +    Error *local_err = NULL;
> >>>> +    int ret;
> >>>> +
> >>>> +    /**
> >>>> +     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
> >>>> +     * VMSD data from source is not available at RESUME state.
> >>>> +     * Working on a fix for this.
> >>>> +     */
> >>>> +    if (!o->vfu_mig_file) {
> >>>> +        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
> >>>> +    }
> >>>> +
> >>>> +    ret = qemu_remote_loadvm(o->vfu_mig_file);
> >>>> +    if (ret) {
> >>>> +        VFU_OBJECT_ERROR(o, "vfu: failed to restore device state");
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    qemu_file_shutdown(o->vfu_mig_file);
> >>>> +    o->vfu_mig_file = NULL;
> >>>> +
> >>>> +    /* VFU_MIGR_STATE_RUNNING begins here */
> >>>> +    if (++migrated_devs == k->nr_devs) {
> >>> 
> >>> When is this counter reset so migration can be tried again if it
> >>> fails/cancels?
> >> 
> >> Detecting cancellation is a pending item. We will address it in the
> >> next rev. Will check with you if we get stuck during the process
> >> of implementing it.
> >> 
> >>> 
> >>>> +static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
> >>>> +                                 uint64_t size, uint64_t offset)
> >>>> +{
> >>>> +    VfuObject *o = vfu_get_private(vfu_ctx);
> >>>> +
> >>>> +    if (offset > o->vfu_mig_buf_size) {
> >>>> +        return -1;
> >>>> +    }
> >>>> +
> >>>> +    if ((offset + size) > o->vfu_mig_buf_size) {
> >>>> +        warn_report("vfu: buffer overflow - check pending_bytes");
> >>>> +        size = o->vfu_mig_buf_size - offset;
> >>>> +    }
> >>>> +
> >>>> +    memcpy(buf, (o->vfu_mig_buf + offset), size);
> >>>> +
> >>>> +    o->vfu_mig_buf_pending -= size;
> >>> 
> >>> This assumes that the caller increments offset by size each time. If
> >>> that assumption is okay, then we can just trust offset and don't need to
> >>> do arithmetic on vfu_mig_buf_pending. If that assumption is not correct,
> >>> then the code needs to be extended to safely update vfu_mig_buf_pending
> >>> when offset jumps around arbitrarily between calls.
> >> 
> >> Going by the definition of vfu_migration_callbacks_t in the library, I assumed
> >> that read_data advances the offset by size bytes.
> >> 
> >> Will add a comment to explain that.
> >> 
> >>> 
> >>>> +uint64_t vmstate_vmsd_size(PCIDevice *pci_dev)
> >>>> +{
> >>>> +    DeviceClass *dc = DEVICE_GET_CLASS(DEVICE(pci_dev));
> >>>> +    const VMStateField *field = NULL;
> >>>> +    uint64_t size = 0;
> >>>> +
> >>>> +    if (!dc->vmsd) {
> >>>> +        return 0;
> >>>> +    }
> >>>> +
> >>>> +    field = dc->vmsd->fields;
> >>>> +    while (field && field->name) {
> >>>> +        size += vmstate_size(pci_dev, field);
> >>>> +        field++;
> >>>> +    }
> >>>> +
> >>>> +    return size;
> >>>> +}
> >>> 
> >>> This function looks incorrect because it ignores subsections as well as
> >>> runtime behavior during save(). Although VMStateDescription is partially
> >>> declarative, there is still a bunch of imperative code that can write to
> >>> the QEMUFile at save() time so there's no way of knowing the size ahead
> >>> of time.
> >> 
> >> I see your point, it would be a problem for any field which has the
> >> (VMS_BUFFER | VMS_ALLOC) flags set.
> >> 
> >>> 
> >>> I asked this in a previous revision of this series but I'm not sure if
> >>> it was answered: is it really necessary to know the size of the vmstate?
> >>> I thought the VFIO migration interface is designed to support
> >>> streaming reads/writes. We could choose a fixed size like 64KB and
> >>> stream the vmstate in 64KB chunks.
> >> 
> >> The library exposes the migration data to the client as a device BAR with
> >> fixed size - the size of which is fixed at boot time, even when using
> >> vfu_migration_callbacks_t callbacks.
> >> 
> >> I don’t believe the library supports streaming vmstate/migration-data - see
> >> the following comment in migration_region_access() defined in the library:
> >> 
> >> * Does this mean that partial reads are not allowed?
> >> 
> >> Thanos or John,
> >> 
> >>    Could you please clarify this?
> >> 
> >> Stefan,
> >>    We attempted to answer the migration cancellation and vmstate size
> >>    questions previously also, in the following email:
> >> 
> >> https://lore.kernel.org/all/F48606B1-15A4-4DD2-9D71-2FCAFC0E671F@oracle.com/
> > 
> >> libvfio-user has the vfu_migration_callbacks_t interface that allows the
> >> device to save/load more data regardless of the size of the migration
> >> region. I don't see the issue here since the region doesn't need to be
> >> sized to fit the savevm data?
> > 
> > The answer didn't make sense to me:
> > 
> > "In both scenarios at the server end - whether using the migration BAR or
> > using callbacks, the migration data is transported to the other end using
> > the BAR. As such we need to specify the BAR’s size during initialization.
> > 
> > In the case of the callbacks, the library translates the BAR access to callbacks."
> > 
> > The BAR and the migration region within it need a size but my
> > understanding is that VFIO migration is designed to stream the device
> > state, allowing it to be broken up into multiple reads/writes without
> > knowing the device state's size upfront. Here is the description from
> > <linux/vfio.h>:
> > 
> >  * The sequence to be followed while in pre-copy state and stop-and-copy state
> >  * is as follows:
> >  * a. Read pending_bytes, indicating the start of a new iteration to get device
> >  *    data. Repeated read on pending_bytes at this stage should have no side
> >  *    effects.
> >  *    If pending_bytes == 0, the user application should not iterate to get data
> >  *    for that device.
> >  *    If pending_bytes > 0, perform the following steps.
> >  * b. Read data_offset, indicating that the vendor driver should make data
> >  *    available through the data section. The vendor driver should return this
> >  *    read operation only after data is available from (region + data_offset)
> >  *    to (region + data_offset + data_size).
> >  * c. Read data_size, which is the amount of data in bytes available through
> >  *    the migration region.
> >  *    Read on data_offset and data_size should return the offset and size of
> >  *    the current buffer if the user application reads data_offset and
> >  *    data_size more than once here.
> >  * d. Read data_size bytes of data from (region + data_offset) from the
> >  *    migration region.
> >  * e. Process the data.
> >  * f. Read pending_bytes, which indicates that the data from the previous
> >  *    iteration has been read. If pending_bytes > 0, go to step b.
> >  *
> >  * The user application can transition from the _SAVING|_RUNNING
> >  * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
> >  * number of pending bytes. The user application should iterate in _SAVING
> >  * (stop-and-copy) until pending_bytes is 0.
> > 
> > This means you can report pending_bytes > 0 until the entire vmstate has
> > been read and can pick a fixed chunk size like 64KB for the migration
> > region. There's no need to size the migration region to fit the entire
> > vmstate.
> 
> Thank you for the pointer to generic VFIO migration, Stefan! Makes sense.
> 
> So I understand that the VFIO migration region carves out a section to
> stream/shuttle device data between the app (QEMU client in this case) and the
> driver (QEMU server). This section starts at data_offset within the region and spans
> data_size bytes.
> 
> We could change the server to stream the data as outlined above. Do you have a
> preference for the section size? Does qemu_target_page_size() work? I just tested
> and am able to stream with a fixed BAR size such as qemu_target_page_size().

The VFIO migration API requires that data is written in the same chunk
sizes as it was read, so there is no way to merge or split chunks for
performance reasons once they have been read.

4KB may result in lots of chunks and that means more network traffic and
read()/write() calls. I think it's too small.

Something large like 1MB might create issues with responsiveness because
a 1MB chunk hogs the migration stream and read()/write() latency could
hog the event loop.

I'd go for 64KB. Dave and Juan might also have a suggestion for the size.

Stefan
diff mbox series

Patch

diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 017c03675c..68bea576ea 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -1165,6 +1165,8 @@  extern const VMStateInfo vmstate_info_qlist;
 #define VMSTATE_END_OF_LIST()                                         \
     {}
 
+uint64_t vmstate_vmsd_size(PCIDevice *pci_dev);
+
 int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
                        void *opaque, int version_id);
 int vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
diff --git a/migration/savevm.h b/migration/savevm.h
index 6461342cb4..8007064ff2 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -67,5 +67,7 @@  int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 int qemu_load_device_state(QEMUFile *f);
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
         bool in_postcopy, bool inactivate_disks);
+int qemu_remote_savevm(QEMUFile *f, DeviceState *dev);
+int qemu_remote_loadvm(QEMUFile *f);
 
 #endif
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 1771dba1bf..d3c51577bd 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -55,6 +55,11 @@ 
 #include "qemu/timer.h"
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
+#include "migration/qemu-file.h"
+#include "migration/savevm.h"
+#include "migration/vmstate.h"
+#include "migration/global_state.h"
+#include "block/block.h"
 
 #define TYPE_VFU_OBJECT "x-vfio-user-server"
 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
@@ -106,6 +111,35 @@  struct VfuObject {
     Error *unplug_blocker;
 
     int vfu_poll_fd;
+
+    /*
+     * vfu_mig_buf holds the migration data. In the remote server, this
+     * buffer replaces the role of an IO channel which links the source
+     * and the destination.
+     *
+     * Whenever the client QEMU process initiates migration, the remote
+     * server gets notified via libvfio-user callbacks. The remote server
+     * sets up a QEMUFile object using this buffer as backend. The remote
+     * server passes this object to its migration subsystem, which slurps
+     * the VMSD of the device ('devid' above) referenced by this object
+     * and stores the VMSD in this buffer.
+     *
+     * The client subsequently asks the remote server for any data that
+     * needs to be moved over to the destination via libvfio-user
+     * library's vfu_migration_callbacks_t callbacks. The remote hands
+     * over this buffer as data at this time.
+     *
+     * A reverse of this process happens at the destination.
+     */
+    uint8_t *vfu_mig_buf;
+
+    uint64_t vfu_mig_buf_size;
+
+    uint64_t vfu_mig_buf_pending;
+
+    QEMUFile *vfu_mig_file;
+
+    vfu_migr_state_t vfu_state;
 };
 
 static GHashTable *vfu_object_dev_to_ctx_table;
@@ -157,6 +191,272 @@  static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
     vfu_object_init_ctx(o, errp);
 }
 
+/**
+ * Migration helper functions
+ *
+ * vfu_mig_buf_read & vfu_mig_buf_write are used by QEMU's migration
+ * subsystem - qemu_remote_loadvm & qemu_remote_savevm. loadvm/savevm
+ * call these functions via QEMUFileOps to load/save the VMSD of a
+ * device into vfu_mig_buf
+ *
+ */
+static ssize_t vfu_mig_buf_read(void *opaque, uint8_t *buf, int64_t pos,
+                                size_t size, Error **errp)
+{
+    VfuObject *o = opaque;
+
+    if (pos > o->vfu_mig_buf_size) {
+        size = 0;
+    } else if ((pos + size) > o->vfu_mig_buf_size) {
+        size = o->vfu_mig_buf_size - pos;
+    }
+
+    memcpy(buf, (o->vfu_mig_buf + pos), size);
+
+    return size;
+}
+
+static ssize_t vfu_mig_buf_write(void *opaque, struct iovec *iov, int iovcnt,
+                                 int64_t pos, Error **errp)
+{
+    VfuObject *o = opaque;
+    uint64_t end = pos + iov_size(iov, iovcnt);
+    int i;
+
+    if (end > o->vfu_mig_buf_size) {
+        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
+    }
+
+    for (i = 0; i < iovcnt; i++) {
+        memcpy((o->vfu_mig_buf + o->vfu_mig_buf_size), iov[i].iov_base,
+               iov[i].iov_len);
+        o->vfu_mig_buf_size += iov[i].iov_len;
+        o->vfu_mig_buf_pending += iov[i].iov_len;
+    }
+
+    return iov_size(iov, iovcnt);
+}
+
+static int vfu_mig_buf_shutdown(void *opaque, bool rd, bool wr, Error **errp)
+{
+    VfuObject *o = opaque;
+
+    o->vfu_mig_buf_size = 0;
+
+    g_free(o->vfu_mig_buf);
+
+    o->vfu_mig_buf = NULL;
+
+    o->vfu_mig_buf_pending = 0;
+
+    return 0;
+}
+
+static const QEMUFileOps vfu_mig_fops_save = {
+    .writev_buffer  = vfu_mig_buf_write,
+    .shut_down      = vfu_mig_buf_shutdown,
+};
+
+static const QEMUFileOps vfu_mig_fops_load = {
+    .get_buffer     = vfu_mig_buf_read,
+    .shut_down      = vfu_mig_buf_shutdown,
+};
+
+/**
+ * handlers for vfu_migration_callbacks_t
+ *
+ * The libvfio-user library accesses these handlers to drive the migration
+ * at the remote end, and also to transport the data stored in vfu_mig_buf
+ *
+ */
+static void vfu_mig_state_stop_and_copy(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    int ret;
+
+    if (!o->vfu_mig_file) {
+        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_save, false);
+    }
+
+    ret = qemu_remote_savevm(o->vfu_mig_file, DEVICE(o->pci_dev));
+    if (ret) {
+        qemu_file_shutdown(o->vfu_mig_file);
+        o->vfu_mig_file = NULL;
+        return;
+    }
+
+    qemu_fflush(o->vfu_mig_file);
+}
+
+static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
+    static int migrated_devs;
+    Error *local_err = NULL;
+    int ret;
+
+    /**
+     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
+     * VMSD data from source is not available at RESUME state.
+     * Working on a fix for this.
+     */
+    if (!o->vfu_mig_file) {
+        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
+    }
+
+    ret = qemu_remote_loadvm(o->vfu_mig_file);
+    if (ret) {
+        VFU_OBJECT_ERROR(o, "vfu: failed to restore device state");
+        return;
+    }
+
+    qemu_file_shutdown(o->vfu_mig_file);
+    o->vfu_mig_file = NULL;
+
+    /* VFU_MIGR_STATE_RUNNING begins here */
+    if (++migrated_devs == k->nr_devs) {
+        bdrv_invalidate_cache_all(&local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return;
+        }
+
+        vm_start();
+    }
+}
+
+static void vfu_mig_state_stop(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
+    static int migrated_devs;
+
+    /**
+     * note: calling bdrv_inactivate_all() is not the best approach.
+     *
+     *  Ideally, we would identify the block devices (if any) indirectly
+     *  linked (such as via a scsi-hd device) to each of the migrated devices,
+     *  and inactivate them individually. This is essential while operating
+     *  the server in a storage daemon mode, with devices from different VMs.
+     *
+     *  However, we currently don't have this capability. As such, we need to
+     *  inactivate all devices at the same time when migration is completed.
+     */
+    if (++migrated_devs == k->nr_devs) {
+        vm_stop(RUN_STATE_PAUSED);
+        bdrv_inactivate_all();
+    }
+}
+
+static int vfu_mig_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    if (o->vfu_state == state) {
+        return 0;
+    }
+
+    switch (state) {
+    case VFU_MIGR_STATE_RESUME:
+        break;
+    case VFU_MIGR_STATE_STOP_AND_COPY:
+        vfu_mig_state_stop_and_copy(vfu_ctx);
+        break;
+    case VFU_MIGR_STATE_STOP:
+        vfu_mig_state_stop(vfu_ctx);
+        break;
+    case VFU_MIGR_STATE_PRE_COPY:
+        break;
+    case VFU_MIGR_STATE_RUNNING:
+        if (!runstate_is_running()) {
+            vfu_mig_state_running(vfu_ctx);
+        }
+        break;
+    default:
+        warn_report("vfu: Unknown migration state %d", state);
+    }
+
+    o->vfu_state = state;
+
+    return 0;
+}
+
+static uint64_t vfu_mig_get_pending_bytes(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    return o->vfu_mig_buf_pending;
+}
+
+static int vfu_mig_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset,
+                                uint64_t *size)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    if (offset) {
+        *offset = 0;
+    }
+
+    if (size) {
+        *size = o->vfu_mig_buf_size;
+    }
+
+    return 0;
+}
+
+static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
+                                 uint64_t size, uint64_t offset)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    if (offset > o->vfu_mig_buf_size) {
+        return -1;
+    }
+
+    if ((offset + size) > o->vfu_mig_buf_size) {
+        warn_report("vfu: buffer overflow - check pending_bytes");
+        size = o->vfu_mig_buf_size - offset;
+    }
+
+    memcpy(buf, (o->vfu_mig_buf + offset), size);
+
+    o->vfu_mig_buf_pending -= size;
+
+    return size;
+}
+
+static ssize_t vfu_mig_write_data(vfu_ctx_t *vfu_ctx, void *data,
+                                  uint64_t size, uint64_t offset)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint64_t end = offset + size;
+
+    if (end > o->vfu_mig_buf_size) {
+        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
+        o->vfu_mig_buf_size = end;
+    }
+
+    memcpy((o->vfu_mig_buf + offset), data, size);
+
+    return size;
+}
+
+static int vfu_mig_data_written(vfu_ctx_t *vfu_ctx, uint64_t count)
+{
+    return 0;
+}
+
+static const vfu_migration_callbacks_t vfu_mig_cbs = {
+    .version = VFU_MIGR_CALLBACKS_VERS,
+    .transition = &vfu_mig_transition,
+    .get_pending_bytes = &vfu_mig_get_pending_bytes,
+    .prepare_data = &vfu_mig_prepare_data,
+    .read_data = &vfu_mig_read_data,
+    .data_written = &vfu_mig_data_written,
+    .write_data = &vfu_mig_write_data,
+};
+
 static void vfu_object_ctx_run(void *opaque)
 {
     VfuObject *o = opaque;
@@ -476,6 +776,7 @@  static void vfu_object_init_ctx(VfuObject *o, Error **errp)
     ERRP_GUARD();
     DeviceState *dev = NULL;
     vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
+    uint64_t migr_regs_size, migr_size;
     int ret;
 
     if (o->vfu_ctx || !o->socket || !o->device ||
@@ -555,6 +856,26 @@  static void vfu_object_init_ctx(VfuObject *o, Error **errp)
         goto fail;
     }
 
+    migr_regs_size = vfu_get_migr_register_area_size();
+    migr_size = migr_regs_size + vmstate_vmsd_size(o->pci_dev);
+
+    ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
+                           migr_size, NULL,
+                           VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
+    if (ret < 0) {
+        error_setg(errp, "vfu: Failed to register migration BAR %s- %s",
+                   o->device, strerror(errno));
+        goto fail;
+    }
+
+    ret = vfu_setup_device_migration_callbacks(o->vfu_ctx, &vfu_mig_cbs,
+                                               migr_regs_size);
+    if (ret < 0) {
+        error_setg(errp, "vfu: Failed to setup migration %s- %s",
+                   o->device, strerror(errno));
+        goto fail;
+    }
+
     ret = vfu_realize_ctx(o->vfu_ctx);
     if (ret < 0) {
         error_setg(errp, "vfu: Failed to realize device %s- %s",
@@ -604,6 +925,8 @@  static void vfu_object_init(Object *obj)
     }
 
     o->vfu_poll_fd = -1;
+
+    o->vfu_state = VFU_MIGR_STATE_STOP;
 }
 
 static void vfu_object_finalize(Object *obj)
diff --git a/migration/savevm.c b/migration/savevm.c
index 0bef031acb..be119e2e59 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1605,6 +1605,49 @@  static int qemu_savevm_state(QEMUFile *f, Error **errp)
     return ret;
 }
 
+static SaveStateEntry *find_se_from_dev(DeviceState *dev)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (se->opaque == dev) {
+            return se;
+        }
+    }
+
+    return NULL;
+}
+
+int qemu_remote_savevm(QEMUFile *f, DeviceState *dev)
+{
+    SaveStateEntry *se;
+    int ret = 0;
+
+    se = find_se_from_dev(dev);
+    if (!se) {
+        return -ENODEV;
+    }
+
+    if (!se->vmsd || !vmstate_save_needed(se->vmsd, se->opaque)) {
+        return ret;
+    }
+
+    save_section_header(f, se, QEMU_VM_SECTION_FULL);
+
+    ret = vmstate_save(f, se, NULL);
+    if (ret) {
+        qemu_file_set_error(f, ret);
+        return ret;
+    }
+
+    save_section_footer(f, se);
+
+    qemu_put_byte(f, QEMU_VM_EOF);
+    qemu_fflush(f);
+
+    return 0;
+}
+
 void qemu_savevm_live_state(QEMUFile *f)
 {
     /* save QEMU_VM_SECTION_END section */
@@ -2446,6 +2489,36 @@  qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
     return 0;
 }
 
+int qemu_remote_loadvm(QEMUFile *f)
+{
+    uint8_t section_type;
+    int ret = 0;
+
+    while (true) {
+        section_type = qemu_get_byte(f);
+
+        ret = qemu_file_get_error(f);
+        if (ret) {
+            break;
+        }
+
+        switch (section_type) {
+        case QEMU_VM_SECTION_FULL:
+            ret = qemu_loadvm_section_start_full(f, NULL);
+            if (ret < 0) {
+                break;
+            }
+            break;
+        case QEMU_VM_EOF:
+            return ret;
+        default:
+            return -EINVAL;
+        }
+    }
+
+    return ret;
+}
+
 static int
 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
 {
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 05f87cdddc..83f8562792 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -63,6 +63,25 @@  static int vmstate_size(void *opaque, const VMStateField *field)
     return size;
 }
 
+uint64_t vmstate_vmsd_size(PCIDevice *pci_dev)
+{
+    DeviceClass *dc = DEVICE_GET_CLASS(DEVICE(pci_dev));
+    const VMStateField *field = NULL;
+    uint64_t size = 0;
+
+    if (!dc->vmsd) {
+        return 0;
+    }
+
+    field = dc->vmsd->fields;
+    while (field && field->name) {
+        size += vmstate_size(pci_dev, field);
+        field++;
+    }
+
+    return size;
+}
+
 static void vmstate_handle_alloc(void *ptr, const VMStateField *field,
                                  void *opaque)
 {