diff mbox series

[RFC,2/2] vhost-user-fs: Implement stateful migration

Message ID 20230313174833.28790-3-hreitz@redhat.com
State New
Headers show
Series vhost-user-fs: Stateful migration | expand

Commit Message

Hanna Czenczek March 13, 2023, 5:48 p.m. UTC
A virtio-fs device's VM state consists of:
- the virtio device (vring) state (VMSTATE_VIRTIO_DEVICE)
- the back-end's (virtiofsd's) internal state

We get/set the latter via the new vhost-user operations FS_SET_STATE_FD,
FS_GET_STATE, and FS_SET_STATE.

Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
---
 hw/virtio/vhost-user-fs.c | 171 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 170 insertions(+), 1 deletion(-)

Comments

Anton Kuchin March 17, 2023, 5:19 p.m. UTC | #1
On 13/03/2023 19:48, Hanna Czenczek wrote:
> A virtio-fs device's VM state consists of:
> - the virtio device (vring) state (VMSTATE_VIRTIO_DEVICE)
> - the back-end's (virtiofsd's) internal state
>
> We get/set the latter via the new vhost-user operations FS_SET_STATE_FD,
> FS_GET_STATE, and FS_SET_STATE.
>
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>   hw/virtio/vhost-user-fs.c | 171 +++++++++++++++++++++++++++++++++++++-
>   1 file changed, 170 insertions(+), 1 deletion(-)
>
> diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
> index 83fc20e49e..df1fb02acc 100644
> --- a/hw/virtio/vhost-user-fs.c
> +++ b/hw/virtio/vhost-user-fs.c
> @@ -20,8 +20,10 @@
>   #include "hw/virtio/virtio-bus.h"
>   #include "hw/virtio/virtio-access.h"
>   #include "qemu/error-report.h"
> +#include "qemu/memfd.h"
>   #include "hw/virtio/vhost.h"
>   #include "hw/virtio/vhost-user-fs.h"
> +#include "migration/qemu-file-types.h"
>   #include "monitor/monitor.h"
>   #include "sysemu/sysemu.h"
>   
> @@ -298,9 +300,176 @@ static struct vhost_dev *vuf_get_vhost(VirtIODevice *vdev)
>       return &fs->vhost_dev;
>   }
>   
> +/**
> + * Fetch the internal state from the back-end (virtiofsd) and save it
> + * to `f`.
> + */
> +static int vuf_save_state(QEMUFile *f, void *pv, size_t size,
> +                          const VMStateField *field, JSONWriter *vmdesc)
> +{
> +    VirtIODevice *vdev = pv;
> +    VHostUserFS *fs = VHOST_USER_FS(vdev);
> +    int memfd = -1;
> +    /* Size of the shared memory through which to transfer the state */
> +    const size_t chunk_size = 4 * 1024 * 1024;
> +    size_t state_offset;
> +    ssize_t remaining;
> +    void *shm_buf;
> +    Error *local_err = NULL;
> +    int ret, ret2;
> +
> +    /* Set up shared memory through which to receive the state from virtiofsd */
> +    shm_buf = qemu_memfd_alloc("vhost-fs-state", chunk_size,
> +                               F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW,
> +                               &memfd, &local_err);
> +    if (!shm_buf) {
> +        error_report_err(local_err);
> +        ret = -ENOMEM;
> +        goto early_fail;
> +    }
> +
> +    /* Share the SHM area with virtiofsd */
> +    ret = vhost_fs_set_state_fd(&fs->vhost_dev, memfd, chunk_size);
> +    if (ret < 0) {
> +        goto early_fail;

Don't we need some log message here too?

> +    }
> +
> +    /* Receive the virtiofsd state in chunks, and write them to `f` */
> +    state_offset = 0;
> +    do {
> +        size_t this_chunk_size;
> +
> +        remaining = vhost_fs_get_state(&fs->vhost_dev, state_offset,
> +                                       chunk_size);
> +        if (remaining < 0) {
> +            ret = remaining;
> +            goto fail;
> +        }
> +
> +        /* Prefix the whole state by its total length */
> +        if (state_offset == 0) {
> +            qemu_put_be64(f, remaining);
> +        }
> +
> +        this_chunk_size = MIN(remaining, chunk_size);
> +        qemu_put_buffer(f, shm_buf, this_chunk_size);
> +        state_offset += this_chunk_size;
> +    } while (remaining >= chunk_size);
> +
> +    ret = 0;
> +fail:
> +    /* Have virtiofsd close the shared memory */
> +    ret2 = vhost_fs_set_state_fd(&fs->vhost_dev, -1, 0);
> +    if (ret2 < 0) {
> +        error_report("Failed to remove state FD from the vhost-user-fs back "
> +                     "end: %s", strerror(-ret));
> +        if (ret == 0) {
> +            ret = ret2;
> +        }
> +    }
> +
> +early_fail:
> +    if (shm_buf) {
> +        qemu_memfd_free(shm_buf, chunk_size, memfd);
> +    }
> +
> +    return ret;
> +}
> +
> +/**
> + * Load the back-end's (virtiofsd's) internal state from `f` and send
> + * it over to that back-end.
> + */
> +static int vuf_load_state(QEMUFile *f, void *pv, size_t size,
> +                          const VMStateField *field)
> +{
> +    VirtIODevice *vdev = pv;
> +    VHostUserFS *fs = VHOST_USER_FS(vdev);
> +    int memfd = -1;
> +    /* Size of the shared memory through which to transfer the state */
> +    const size_t chunk_size = 4 * 1024 * 1024;
> +    size_t state_offset;
> +    uint64_t remaining;
> +    void *shm_buf;
> +    Error *local_err = NULL;
> +    int ret, ret2;
> +
> +    /* The state is prefixed by its total length, read that first */
> +    remaining = qemu_get_be64(f);
> +
> +    /* Set up shared memory through which to send the state to virtiofsd */
> +    shm_buf = qemu_memfd_alloc("vhost-fs-state", chunk_size,
> +                               F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW,
> +                               &memfd, &local_err);
> +    if (!shm_buf) {
> +        error_report_err(local_err);
> +        ret = -ENOMEM;
> +        goto early_fail;
> +    }
> +
> +    /* Share the SHM area with virtiofsd */
> +    ret = vhost_fs_set_state_fd(&fs->vhost_dev, memfd, chunk_size);
> +    if (ret < 0) {
> +        goto early_fail;
> +    }
> +
> +    /*
> +     * Read the virtiofsd state in chunks from `f`, and send them over
> +     * to virtiofsd
> +     */
> +    state_offset = 0;
> +    do {
> +        size_t this_chunk_size = MIN(remaining, chunk_size);
> +
> +        if (qemu_get_buffer(f, shm_buf, this_chunk_size) < this_chunk_size) {
> +            ret = -EINVAL;
> +            goto fail;
> +        }
> +
> +        ret = vhost_fs_set_state(&fs->vhost_dev, state_offset, this_chunk_size);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +
> +        state_offset += this_chunk_size;
> +        remaining -= this_chunk_size;
> +    } while (remaining > 0);
> +
> +    ret = 0;
> +fail:
> +    ret2 = vhost_fs_set_state_fd(&fs->vhost_dev, -1, 0);
> +    if (ret2 < 0) {
> +        error_report("Failed to remove state FD from the vhost-user-fs back "
> +                     "end -- perhaps it failed to deserialize/apply the state: "
> +                     "%s", strerror(-ret2));
> +        if (ret == 0) {
> +            ret = ret2;
> +        }
> +    }
> +
> +early_fail:
> +    if (shm_buf) {
> +        qemu_memfd_free(shm_buf, chunk_size, memfd);
> +    }
> +
> +    return ret;
> +}
> +
>   static const VMStateDescription vuf_vmstate = {
>       .name = "vhost-user-fs",
> -    .unmigratable = 1,
> +    .version_id = 1,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_VIRTIO_DEVICE,
> +        {
> +            .name = "back-end",
> +            .info = &(const VMStateInfo) {
> +                .name = "virtio-fs back-end state",
> +                .get = vuf_load_state,
> +                .put = vuf_save_state,
> +            },
> +        },

I've been working on stateless migration patch [1] and there was 
discussed that we
need to keep some kind of blocker by default if orchestrators rely on 
unmigratable
field in virtio-fs vmstate to block the migration.
For this purpose I've implemented flag that selects "none" or "external" 
and is checked
in pre_save, so it could be extended with "internal" option.
We didn't come to conclusion if we also need to check incoming 
migration, the discussion
has stopped for a while but I'm going back to it now.

I would appreciate if you have time to take a look at the discussion and 
consider the idea
proposed there to store internal state as a subsection of vmstate to 
make it as an option
but not mandatory.

[1] 
https://patchew.org/QEMU/20230217170038.1273710-1-antonkuchin@yandex-team.ru/

> +        VMSTATE_END_OF_LIST()
> +    },
>   };
>   
>   static Property vuf_properties[] = {
Hanna Czenczek March 17, 2023, 5:52 p.m. UTC | #2
On 17.03.23 18:19, Anton Kuchin wrote:
> On 13/03/2023 19:48, Hanna Czenczek wrote:
>> A virtio-fs device's VM state consists of:
>> - the virtio device (vring) state (VMSTATE_VIRTIO_DEVICE)
>> - the back-end's (virtiofsd's) internal state
>>
>> We get/set the latter via the new vhost-user operations FS_SET_STATE_FD,
>> FS_GET_STATE, and FS_SET_STATE.
>>
>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>> ---
>>   hw/virtio/vhost-user-fs.c | 171 +++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 170 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
>> index 83fc20e49e..df1fb02acc 100644
>> --- a/hw/virtio/vhost-user-fs.c
>> +++ b/hw/virtio/vhost-user-fs.c
>> @@ -20,8 +20,10 @@
>>   #include "hw/virtio/virtio-bus.h"
>>   #include "hw/virtio/virtio-access.h"
>>   #include "qemu/error-report.h"
>> +#include "qemu/memfd.h"
>>   #include "hw/virtio/vhost.h"
>>   #include "hw/virtio/vhost-user-fs.h"
>> +#include "migration/qemu-file-types.h"
>>   #include "monitor/monitor.h"
>>   #include "sysemu/sysemu.h"
>>   @@ -298,9 +300,176 @@ static struct vhost_dev 
>> *vuf_get_vhost(VirtIODevice *vdev)
>>       return &fs->vhost_dev;
>>   }
>>   +/**
>> + * Fetch the internal state from the back-end (virtiofsd) and save it
>> + * to `f`.
>> + */
>> +static int vuf_save_state(QEMUFile *f, void *pv, size_t size,
>> +                          const VMStateField *field, JSONWriter 
>> *vmdesc)
>> +{
>> +    VirtIODevice *vdev = pv;
>> +    VHostUserFS *fs = VHOST_USER_FS(vdev);
>> +    int memfd = -1;
>> +    /* Size of the shared memory through which to transfer the state */
>> +    const size_t chunk_size = 4 * 1024 * 1024;
>> +    size_t state_offset;
>> +    ssize_t remaining;
>> +    void *shm_buf;
>> +    Error *local_err = NULL;
>> +    int ret, ret2;
>> +
>> +    /* Set up shared memory through which to receive the state from 
>> virtiofsd */
>> +    shm_buf = qemu_memfd_alloc("vhost-fs-state", chunk_size,
>> +                               F_SEAL_SEAL | F_SEAL_SHRINK | 
>> F_SEAL_GROW,
>> +                               &memfd, &local_err);
>> +    if (!shm_buf) {
>> +        error_report_err(local_err);
>> +        ret = -ENOMEM;
>> +        goto early_fail;
>> +    }
>> +
>> +    /* Share the SHM area with virtiofsd */
>> +    ret = vhost_fs_set_state_fd(&fs->vhost_dev, memfd, chunk_size);
>> +    if (ret < 0) {
>> +        goto early_fail;
>
> Don't we need some log message here too?

Sure, why not.  There are other places in this patch that just return 
-errno but print no error, I think they could all use a verbose error 
message.

>> +    }
>> +
>> +    /* Receive the virtiofsd state in chunks, and write them to `f` */
>> +    state_offset = 0;
>> +    do {
>> +        size_t this_chunk_size;
>> +
>> +        remaining = vhost_fs_get_state(&fs->vhost_dev, state_offset,
>> +                                       chunk_size);
>> +        if (remaining < 0) {
>> +            ret = remaining;
>> +            goto fail;
>> +        }
>> +
>> +        /* Prefix the whole state by its total length */
>> +        if (state_offset == 0) {
>> +            qemu_put_be64(f, remaining);
>> +        }
>> +
>> +        this_chunk_size = MIN(remaining, chunk_size);
>> +        qemu_put_buffer(f, shm_buf, this_chunk_size);
>> +        state_offset += this_chunk_size;
>> +    } while (remaining >= chunk_size);
>> +
>> +    ret = 0;
>> +fail:
>> +    /* Have virtiofsd close the shared memory */
>> +    ret2 = vhost_fs_set_state_fd(&fs->vhost_dev, -1, 0);
>> +    if (ret2 < 0) {
>> +        error_report("Failed to remove state FD from the 
>> vhost-user-fs back "
>> +                     "end: %s", strerror(-ret));
>> +        if (ret == 0) {
>> +            ret = ret2;
>> +        }
>> +    }
>> +
>> +early_fail:
>> +    if (shm_buf) {
>> +        qemu_memfd_free(shm_buf, chunk_size, memfd);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +/**
>> + * Load the back-end's (virtiofsd's) internal state from `f` and send
>> + * it over to that back-end.
>> + */
>> +static int vuf_load_state(QEMUFile *f, void *pv, size_t size,
>> +                          const VMStateField *field)
>> +{
>> +    VirtIODevice *vdev = pv;
>> +    VHostUserFS *fs = VHOST_USER_FS(vdev);
>> +    int memfd = -1;
>> +    /* Size of the shared memory through which to transfer the state */
>> +    const size_t chunk_size = 4 * 1024 * 1024;
>> +    size_t state_offset;
>> +    uint64_t remaining;
>> +    void *shm_buf;
>> +    Error *local_err = NULL;
>> +    int ret, ret2;
>> +
>> +    /* The state is prefixed by its total length, read that first */
>> +    remaining = qemu_get_be64(f);
>> +
>> +    /* Set up shared memory through which to send the state to 
>> virtiofsd */
>> +    shm_buf = qemu_memfd_alloc("vhost-fs-state", chunk_size,
>> +                               F_SEAL_SEAL | F_SEAL_SHRINK | 
>> F_SEAL_GROW,
>> +                               &memfd, &local_err);
>> +    if (!shm_buf) {
>> +        error_report_err(local_err);
>> +        ret = -ENOMEM;
>> +        goto early_fail;
>> +    }
>> +
>> +    /* Share the SHM area with virtiofsd */
>> +    ret = vhost_fs_set_state_fd(&fs->vhost_dev, memfd, chunk_size);
>> +    if (ret < 0) {
>> +        goto early_fail;
>> +    }
>> +
>> +    /*
>> +     * Read the virtiofsd state in chunks from `f`, and send them over
>> +     * to virtiofsd
>> +     */
>> +    state_offset = 0;
>> +    do {
>> +        size_t this_chunk_size = MIN(remaining, chunk_size);
>> +
>> +        if (qemu_get_buffer(f, shm_buf, this_chunk_size) < 
>> this_chunk_size) {
>> +            ret = -EINVAL;
>> +            goto fail;
>> +        }
>> +
>> +        ret = vhost_fs_set_state(&fs->vhost_dev, state_offset, 
>> this_chunk_size);
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>> +
>> +        state_offset += this_chunk_size;
>> +        remaining -= this_chunk_size;
>> +    } while (remaining > 0);
>> +
>> +    ret = 0;
>> +fail:
>> +    ret2 = vhost_fs_set_state_fd(&fs->vhost_dev, -1, 0);
>> +    if (ret2 < 0) {
>> +        error_report("Failed to remove state FD from the 
>> vhost-user-fs back "
>> +                     "end -- perhaps it failed to deserialize/apply 
>> the state: "
>> +                     "%s", strerror(-ret2));
>> +        if (ret == 0) {
>> +            ret = ret2;
>> +        }
>> +    }
>> +
>> +early_fail:
>> +    if (shm_buf) {
>> +        qemu_memfd_free(shm_buf, chunk_size, memfd);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>>   static const VMStateDescription vuf_vmstate = {
>>       .name = "vhost-user-fs",
>> -    .unmigratable = 1,
>> +    .version_id = 1,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_VIRTIO_DEVICE,
>> +        {
>> +            .name = "back-end",
>> +            .info = &(const VMStateInfo) {
>> +                .name = "virtio-fs back-end state",
>> +                .get = vuf_load_state,
>> +                .put = vuf_save_state,
>> +            },
>> +        },
>
> I've been working on stateless migration patch [1] and there was 
> discussed that we
> need to keep some kind of blocker by default if orchestrators rely on 
> unmigratable
> field in virtio-fs vmstate to block the migration.
> For this purpose I've implemented flag that selects "none" or 
> "external" and is checked
> in pre_save, so it could be extended with "internal" option.
> We didn't come to conclusion if we also need to check incoming 
> migration, the discussion
> has stopped for a while but I'm going back to it now.
>
> I would appreciate if you have time to take a look at the discussion 
> and consider the idea
> proposed there to store internal state as a subsection of vmstate to 
> make it as an option
> but not mandatory.
>
> [1] 
> https://patchew.org/QEMU/20230217170038.1273710-1-antonkuchin@yandex-team.ru/

So far I’ve mostly considered these issues orthogonal.  If your 
stateless migration goes in first, then state is optional and I’ll 
adjust this series.  If stateful migration goes in first, then your 
series can simply make state optional by introducing the external 
option, no?

But maybe we could also consider making stateless migration a special 
case of stateful migration; if we had stateful migration, can’t we just 
implement stateless migration by telling virtiofsd that it should submit 
a special “I have no state” pseudo-state, i.e. by having a switch on 
virtiofsd instead?

Off the top of my head, some downsides of that approach would be (1) 
it’d need a switch on the virtiofsd side, not on the qemu side (not sure 
if that’s a downside, but a difference for sure), and (2) we’d need at 
least some support for this on the virtiofsd side, i.e. practically 
can’t come quicker than stateful migration support.

Hanna
Anton Kuchin March 17, 2023, 6:37 p.m. UTC | #3
On 17/03/2023 19:52, Hanna Czenczek wrote:
> On 17.03.23 18:19, Anton Kuchin wrote:
>> On 13/03/2023 19:48, Hanna Czenczek wrote:
>>> A virtio-fs device's VM state consists of:
>>> - the virtio device (vring) state (VMSTATE_VIRTIO_DEVICE)
>>> - the back-end's (virtiofsd's) internal state
>>>
>>> We get/set the latter via the new vhost-user operations 
>>> FS_SET_STATE_FD,
>>> FS_GET_STATE, and FS_SET_STATE.
>>>
>>> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
>>> ---
>>>   hw/virtio/vhost-user-fs.c | 171 
>>> +++++++++++++++++++++++++++++++++++++-
>>>   1 file changed, 170 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
>>> index 83fc20e49e..df1fb02acc 100644
>>> --- a/hw/virtio/vhost-user-fs.c
>>> +++ b/hw/virtio/vhost-user-fs.c
>>> @@ -20,8 +20,10 @@
>>>   #include "hw/virtio/virtio-bus.h"
>>>   #include "hw/virtio/virtio-access.h"
>>>   #include "qemu/error-report.h"
>>> +#include "qemu/memfd.h"
>>>   #include "hw/virtio/vhost.h"
>>>   #include "hw/virtio/vhost-user-fs.h"
>>> +#include "migration/qemu-file-types.h"
>>>   #include "monitor/monitor.h"
>>>   #include "sysemu/sysemu.h"
>>>   @@ -298,9 +300,176 @@ static struct vhost_dev 
>>> *vuf_get_vhost(VirtIODevice *vdev)
>>>       return &fs->vhost_dev;
>>>   }
>>>   +/**
>>> + * Fetch the internal state from the back-end (virtiofsd) and save it
>>> + * to `f`.
>>> + */
>>> +static int vuf_save_state(QEMUFile *f, void *pv, size_t size,
>>> +                          const VMStateField *field, JSONWriter 
>>> *vmdesc)
>>> +{
>>> +    VirtIODevice *vdev = pv;
>>> +    VHostUserFS *fs = VHOST_USER_FS(vdev);
>>> +    int memfd = -1;
>>> +    /* Size of the shared memory through which to transfer the 
>>> state */
>>> +    const size_t chunk_size = 4 * 1024 * 1024;
>>> +    size_t state_offset;
>>> +    ssize_t remaining;
>>> +    void *shm_buf;
>>> +    Error *local_err = NULL;
>>> +    int ret, ret2;
>>> +
>>> +    /* Set up shared memory through which to receive the state from 
>>> virtiofsd */
>>> +    shm_buf = qemu_memfd_alloc("vhost-fs-state", chunk_size,
>>> +                               F_SEAL_SEAL | F_SEAL_SHRINK | 
>>> F_SEAL_GROW,
>>> +                               &memfd, &local_err);
>>> +    if (!shm_buf) {
>>> +        error_report_err(local_err);
>>> +        ret = -ENOMEM;
>>> +        goto early_fail;
>>> +    }
>>> +
>>> +    /* Share the SHM area with virtiofsd */
>>> +    ret = vhost_fs_set_state_fd(&fs->vhost_dev, memfd, chunk_size);
>>> +    if (ret < 0) {
>>> +        goto early_fail;
>>
>> Don't we need some log message here too?
>
> Sure, why not.  There are other places in this patch that just return 
> -errno but print no error, I think they could all use a verbose error 
> message.
>
>>> +    }
>>> +
>>> +    /* Receive the virtiofsd state in chunks, and write them to `f` */
>>> +    state_offset = 0;
>>> +    do {
>>> +        size_t this_chunk_size;
>>> +
>>> +        remaining = vhost_fs_get_state(&fs->vhost_dev, state_offset,
>>> +                                       chunk_size);
>>> +        if (remaining < 0) {
>>> +            ret = remaining;
>>> +            goto fail;
>>> +        }
>>> +
>>> +        /* Prefix the whole state by its total length */
>>> +        if (state_offset == 0) {
>>> +            qemu_put_be64(f, remaining);
>>> +        }
>>> +
>>> +        this_chunk_size = MIN(remaining, chunk_size);
>>> +        qemu_put_buffer(f, shm_buf, this_chunk_size);
>>> +        state_offset += this_chunk_size;
>>> +    } while (remaining >= chunk_size);
>>> +
>>> +    ret = 0;
>>> +fail:
>>> +    /* Have virtiofsd close the shared memory */
>>> +    ret2 = vhost_fs_set_state_fd(&fs->vhost_dev, -1, 0);
>>> +    if (ret2 < 0) {
>>> +        error_report("Failed to remove state FD from the 
>>> vhost-user-fs back "
>>> +                     "end: %s", strerror(-ret));
>>> +        if (ret == 0) {
>>> +            ret = ret2;
>>> +        }
>>> +    }
>>> +
>>> +early_fail:
>>> +    if (shm_buf) {
>>> +        qemu_memfd_free(shm_buf, chunk_size, memfd);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +/**
>>> + * Load the back-end's (virtiofsd's) internal state from `f` and send
>>> + * it over to that back-end.
>>> + */
>>> +static int vuf_load_state(QEMUFile *f, void *pv, size_t size,
>>> +                          const VMStateField *field)
>>> +{
>>> +    VirtIODevice *vdev = pv;
>>> +    VHostUserFS *fs = VHOST_USER_FS(vdev);
>>> +    int memfd = -1;
>>> +    /* Size of the shared memory through which to transfer the 
>>> state */
>>> +    const size_t chunk_size = 4 * 1024 * 1024;
>>> +    size_t state_offset;
>>> +    uint64_t remaining;
>>> +    void *shm_buf;
>>> +    Error *local_err = NULL;
>>> +    int ret, ret2;
>>> +
>>> +    /* The state is prefixed by its total length, read that first */
>>> +    remaining = qemu_get_be64(f);
>>> +
>>> +    /* Set up shared memory through which to send the state to 
>>> virtiofsd */
>>> +    shm_buf = qemu_memfd_alloc("vhost-fs-state", chunk_size,
>>> +                               F_SEAL_SEAL | F_SEAL_SHRINK | 
>>> F_SEAL_GROW,
>>> +                               &memfd, &local_err);
>>> +    if (!shm_buf) {
>>> +        error_report_err(local_err);
>>> +        ret = -ENOMEM;
>>> +        goto early_fail;
>>> +    }
>>> +
>>> +    /* Share the SHM area with virtiofsd */
>>> +    ret = vhost_fs_set_state_fd(&fs->vhost_dev, memfd, chunk_size);
>>> +    if (ret < 0) {
>>> +        goto early_fail;
>>> +    }
>>> +
>>> +    /*
>>> +     * Read the virtiofsd state in chunks from `f`, and send them over
>>> +     * to virtiofsd
>>> +     */
>>> +    state_offset = 0;
>>> +    do {
>>> +        size_t this_chunk_size = MIN(remaining, chunk_size);
>>> +
>>> +        if (qemu_get_buffer(f, shm_buf, this_chunk_size) < 
>>> this_chunk_size) {
>>> +            ret = -EINVAL;
>>> +            goto fail;
>>> +        }
>>> +
>>> +        ret = vhost_fs_set_state(&fs->vhost_dev, state_offset, 
>>> this_chunk_size);
>>> +        if (ret < 0) {
>>> +            goto fail;
>>> +        }
>>> +
>>> +        state_offset += this_chunk_size;
>>> +        remaining -= this_chunk_size;
>>> +    } while (remaining > 0);
>>> +
>>> +    ret = 0;
>>> +fail:
>>> +    ret2 = vhost_fs_set_state_fd(&fs->vhost_dev, -1, 0);
>>> +    if (ret2 < 0) {
>>> +        error_report("Failed to remove state FD from the 
>>> vhost-user-fs back "
>>> +                     "end -- perhaps it failed to deserialize/apply 
>>> the state: "
>>> +                     "%s", strerror(-ret2));
>>> +        if (ret == 0) {
>>> +            ret = ret2;
>>> +        }
>>> +    }
>>> +
>>> +early_fail:
>>> +    if (shm_buf) {
>>> +        qemu_memfd_free(shm_buf, chunk_size, memfd);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>>   static const VMStateDescription vuf_vmstate = {
>>>       .name = "vhost-user-fs",
>>> -    .unmigratable = 1,
>>> +    .version_id = 1,
>>> +    .fields = (VMStateField[]) {
>>> +        VMSTATE_VIRTIO_DEVICE,
>>> +        {
>>> +            .name = "back-end",
>>> +            .info = &(const VMStateInfo) {
>>> +                .name = "virtio-fs back-end state",
>>> +                .get = vuf_load_state,
>>> +                .put = vuf_save_state,
>>> +            },
>>> +        },
>>
>> I've been working on stateless migration patch [1] and there was 
>> discussed that we
>> need to keep some kind of blocker by default if orchestrators rely on 
>> unmigratable
>> field in virtio-fs vmstate to block the migration.
>> For this purpose I've implemented flag that selects "none" or 
>> "external" and is checked
>> in pre_save, so it could be extended with "internal" option.
>> We didn't come to conclusion if we also need to check incoming 
>> migration, the discussion
>> has stopped for a while but I'm going back to it now.
>>
>> I would appreciate if you have time to take a look at the discussion 
>> and consider the idea
>> proposed there to store internal state as a subsection of vmstate to 
>> make it as an option
>> but not mandatory.
>>
>> [1] 
>> https://patchew.org/QEMU/20230217170038.1273710-1-antonkuchin@yandex-team.ru/
>
> So far I’ve mostly considered these issues orthogonal.  If your 
> stateless migration goes in first, then state is optional and I’ll 
> adjust this series.
> If stateful migration goes in first, then your series can simply make 
> state optional by introducing the external option, no?

Not really. State can be easily extended by subsections but not trimmed. 
Maybe this can be worked around by defining two types of vmstate and 
selecting the correct one at migration, but I'm not sure.

>
> But maybe we could also consider making stateless migration a special 
> case of stateful migration; if we had stateful migration, can’t we 
> just implement stateless migration by telling virtiofsd that it should 
> submit a special “I have no state” pseudo-state, i.e. by having a 
> switch on virtiofsd instead?

Sure. Backend can send empty state (as your patch treats 0 length as a 
valid response and not error) or dummy state that can be recognized as 
stateless. The only potential problem is that then we need support in 
backend for new commands even to return dummy state, and if backend can 
support both types then we'll need some switch in backend to reply with 
real or empty state.

>
> Off the top of my head, some downsides of that approach would be
> (1) it’d need a switch on the virtiofsd side, not on the qemu side 
> (not sure if that’s a downside, but a difference for sure),

Why would you? It seems to me that this affects only how qemu treats the 
vmstate of device. If the state was requested backend sends it to qemu. 
If state subsection is present in stream qemu sends it to the backend 
for loading. Stateless one just doesn't request state from the backend. 
Or am I missing something?

> and (2) we’d need at least some support for this on the virtiofsd 
> side, i.e. practically can’t come quicker than stateful migration 
> support.

Not much, essentially this is just a reconnect. I've sent a draft of a 
reconnect patch for old C-virtiofsd, for rust version it takes much 
longer because I'm learning rust and I'm not really good at it yet.

>
> Hanna
>
Hanna Czenczek March 20, 2023, 9:33 a.m. UTC | #4
On 17.03.23 19:37, Anton Kuchin wrote:
> On 17/03/2023 19:52, Hanna Czenczek wrote:
>> On 17.03.23 18:19, Anton Kuchin wrote:
>>> On 13/03/2023 19:48, Hanna Czenczek wrote:
>>>> A virtio-fs device's VM state consists of:
>>>> - the virtio device (vring) state (VMSTATE_VIRTIO_DEVICE)
>>>> - the back-end's (virtiofsd's) internal state
>>>>
>>>> We get/set the latter via the new vhost-user operations 
>>>> FS_SET_STATE_FD,
>>>> FS_GET_STATE, and FS_SET_STATE.
>>>>

[...]

>>>>   static const VMStateDescription vuf_vmstate = {
>>>>       .name = "vhost-user-fs",
>>>> -    .unmigratable = 1,
>>>> +    .version_id = 1,
>>>> +    .fields = (VMStateField[]) {
>>>> +        VMSTATE_VIRTIO_DEVICE,
>>>> +        {
>>>> +            .name = "back-end",
>>>> +            .info = &(const VMStateInfo) {
>>>> +                .name = "virtio-fs back-end state",
>>>> +                .get = vuf_load_state,
>>>> +                .put = vuf_save_state,
>>>> +            },
>>>> +        },
>>>
>>> I've been working on stateless migration patch [1] and there was 
>>> discussed that we
>>> need to keep some kind of blocker by default if orchestrators rely 
>>> on unmigratable
>>> field in virtio-fs vmstate to block the migration.
>>> For this purpose I've implemented flag that selects "none" or 
>>> "external" and is checked
>>> in pre_save, so it could be extended with "internal" option.
>>> We didn't come to conclusion if we also need to check incoming 
>>> migration, the discussion
>>> has stopped for a while but I'm going back to it now.
>>>
>>> I would appreciate if you have time to take a look at the discussion 
>>> and consider the idea
>>> proposed there to store internal state as a subsection of vmstate to 
>>> make it as an option
>>> but not mandatory.
>>>
>>> [1] 
>>> https://patchew.org/QEMU/20230217170038.1273710-1-antonkuchin@yandex-team.ru/
>>
>> So far I’ve mostly considered these issues orthogonal.  If your 
>> stateless migration goes in first, then state is optional and I’ll 
>> adjust this series.
>> If stateful migration goes in first, then your series can simply make 
>> state optional by introducing the external option, no?
>
> Not really. State can be easily extended by subsections but not 
> trimmed. Maybe this can be worked around by defining two types of 
> vmstate and selecting the correct one at migration, but I'm not sure.

I thought your patch included a switch on the vhost-user-fs device (on 
the qemu side) to tell qemu what migration data to expect.  Can we not 
transfer a 0-length state for 'external', and assert this on the 
destination side?

>>
>> But maybe we could also consider making stateless migration a special 
>> case of stateful migration; if we had stateful migration, can’t we 
>> just implement stateless migration by telling virtiofsd that it 
>> should submit a special “I have no state” pseudo-state, i.e. by 
>> having a switch on virtiofsd instead?
>
> Sure. Backend can send empty state (as your patch treats 0 length as a 
> valid response and not error) or dummy state that can be recognized as 
> stateless. The only potential problem is that then we need support in 
> backend for new commands even to return dummy state, and if backend 
> can support both types then we'll need some switch in backend to reply 
> with real or empty state.

Yes, exactly.

>>
>> Off the top of my head, some downsides of that approach would be
>> (1) it’d need a switch on the virtiofsd side, not on the qemu side 
>> (not sure if that’s a downside, but a difference for sure),
>
> Why would you? It seems to me that this affects only how qemu treats 
> the vmstate of device. If the state was requested backend sends it to 
> qemu. If state subsection is present in stream qemu sends it to the 
> backend for loading. Stateless one just doesn't request state from the 
> backend. Or am I missing something?
>
>> and (2) we’d need at least some support for this on the virtiofsd 
>> side, i.e. practically can’t come quicker than stateful migration 
>> support.
>
> Not much, essentially this is just a reconnect. I've sent a draft of a 
> reconnect patch for old C-virtiofsd, for rust version it takes much 
> longer because I'm learning rust and I'm not really good at it yet.

I meant these two downsides not for your proposal, but instead if we 
implemented stateless migration only in the back-end; i.e. the front-end 
would only implement stateful migration, and the back-end would send and 
accept an empty state.

Then, qemu would always request a “state” (even if it turns out empty 
for stateless migration), and qemu would always treat it the same, so 
there wouldn’t be a switch on the qemu side, but only on the virtiofsd 
side.  Doesn’t really matter, but what does matter is that we’d need to 
implement the migration interface in virtiofsd, even if in the end no 
state is transferred.

So exactly what you’ve said above (“The only potential problem is […]”). :)

Hanna
Anton Kuchin March 20, 2023, 12:39 p.m. UTC | #5
On 20/03/2023 11:33, Hanna Czenczek wrote:
> On 17.03.23 19:37, Anton Kuchin wrote:
>> On 17/03/2023 19:52, Hanna Czenczek wrote:
>>> On 17.03.23 18:19, Anton Kuchin wrote:
>>>> On 13/03/2023 19:48, Hanna Czenczek wrote:
>>>>> A virtio-fs device's VM state consists of:
>>>>> - the virtio device (vring) state (VMSTATE_VIRTIO_DEVICE)
>>>>> - the back-end's (virtiofsd's) internal state
>>>>>
>>>>> We get/set the latter via the new vhost-user operations 
>>>>> FS_SET_STATE_FD,
>>>>> FS_GET_STATE, and FS_SET_STATE.
>>>>>
>
> [...]
>
>>>>>   static const VMStateDescription vuf_vmstate = {
>>>>>       .name = "vhost-user-fs",
>>>>> -    .unmigratable = 1,
>>>>> +    .version_id = 1,
>>>>> +    .fields = (VMStateField[]) {
>>>>> +        VMSTATE_VIRTIO_DEVICE,
>>>>> +        {
>>>>> +            .name = "back-end",
>>>>> +            .info = &(const VMStateInfo) {
>>>>> +                .name = "virtio-fs back-end state",
>>>>> +                .get = vuf_load_state,
>>>>> +                .put = vuf_save_state,
>>>>> +            },
>>>>> +        },
>>>>
>>>> I've been working on stateless migration patch [1] and there was 
>>>> discussed that we
>>>> need to keep some kind of blocker by default if orchestrators rely 
>>>> on unmigratable
>>>> field in virtio-fs vmstate to block the migration.
>>>> For this purpose I've implemented flag that selects "none" or 
>>>> "external" and is checked
>>>> in pre_save, so it could be extended with "internal" option.
>>>> We didn't come to conclusion if we also need to check incoming 
>>>> migration, the discussion
>>>> has stopped for a while but I'm going back to it now.
>>>>
>>>> I would appreciate if you have time to take a look at the 
>>>> discussion and consider the idea
>>>> proposed there to store internal state as a subsection of vmstate 
>>>> to make it as an option
>>>> but not mandatory.
>>>>
>>>> [1] 
>>>> https://patchew.org/QEMU/20230217170038.1273710-1-antonkuchin@yandex-team.ru/
>>>
>>> So far I’ve mostly considered these issues orthogonal.  If your 
>>> stateless migration goes in first, then state is optional and I’ll 
>>> adjust this series.
>>> If stateful migration goes in first, then your series can simply 
>>> make state optional by introducing the external option, no?
>>
>> Not really. State can be easily extended by subsections but not 
>> trimmed. Maybe this can be worked around by defining two types of 
>> vmstate and selecting the correct one at migration, but I'm not sure.
>
> I thought your patch included a switch on the vhost-user-fs device (on 
> the qemu side) to tell qemu what migration data to expect. Can we not 
> transfer a 0-length state for 'external', and assert this on the 
> destination side?

Looks like I really need to make the description of my patch and the 
documentation more clear :) My patch proposes switch on _source_ side to 
select which data to save in the stream mostly to protect old 
orchestrators that don't expect virtio-fs to be migratable (and for 
internal case it can be extended to select if qemu needs to request 
state from backend), Michael insists that we also need to check on 
destination but I disagree because I believe that we can figure this out 
from stream data without additional device flags.

>
>>>
>>> But maybe we could also consider making stateless migration a 
>>> special case of stateful migration; if we had stateful migration, 
>>> can’t we just implement stateless migration by telling virtiofsd 
>>> that it should submit a special “I have no state” pseudo-state, i.e. 
>>> by having a switch on virtiofsd instead?
>>
>> Sure. Backend can send empty state (as your patch treats 0 length as 
>> a valid response and not error) or dummy state that can be recognized 
>> as stateless. The only potential problem is that then we need support 
>> in backend for new commands even to return dummy state, and if 
>> backend can support both types then we'll need some switch in backend 
>> to reply with real or empty state.
>
> Yes, exactly.
>
>>>
>>> Off the top of my head, some downsides of that approach would be
>>> (1) it’d need a switch on the virtiofsd side, not on the qemu side 
>>> (not sure if that’s a downside, but a difference for sure),
>>
>> Why would you? It seems to me that this affects only how qemu treats 
>> the vmstate of device. If the state was requested backend sends it to 
>> qemu. If state subsection is present in stream qemu sends it to the 
>> backend for loading. Stateless one just doesn't request state from 
>> the backend. Or am I missing something?
>>
>>> and (2) we’d need at least some support for this on the virtiofsd 
>>> side, i.e. practically can’t come quicker than stateful migration 
>>> support.
>>
>> Not much, essentially this is just a reconnect. I've sent a draft of 
>> a reconnect patch for old C-virtiofsd, for rust version it takes much 
>> longer because I'm learning rust and I'm not really good at it yet.
>
> I meant these two downsides not for your proposal, but instead if we 
> implemented stateless migration only in the back-end; i.e. the 
> front-end would only implement stateful migration, and the back-end 
> would send and accept an empty state.
>
> Then, qemu would always request a “state” (even if it turns out empty 
> for stateless migration), and qemu would always treat it the same, so 
> there wouldn’t be a switch on the qemu side, but only on the virtiofsd 
> side.  Doesn’t really matter, but what does matter is that we’d need 
> to implement the migration interface in virtiofsd, even if in the end 
> no state is transferred.
>
> So exactly what you’ve said above (“The only potential problem is 
> […]”). :)
>
> Hanna
>

Oh, yes, we were talking about the same thing. So do you agree that 
storing internal state data in subsection will allow backend code to be 
more straightforward without additional switches?
Hanna Czenczek March 21, 2023, 5:49 p.m. UTC | #6
On 20.03.23 13:39, Anton Kuchin wrote:
> On 20/03/2023 11:33, Hanna Czenczek wrote:
>> On 17.03.23 19:37, Anton Kuchin wrote:
>>> On 17/03/2023 19:52, Hanna Czenczek wrote:
>>>> On 17.03.23 18:19, Anton Kuchin wrote:
>>>>> On 13/03/2023 19:48, Hanna Czenczek wrote:
>>>>>> A virtio-fs device's VM state consists of:
>>>>>> - the virtio device (vring) state (VMSTATE_VIRTIO_DEVICE)
>>>>>> - the back-end's (virtiofsd's) internal state
>>>>>>
>>>>>> We get/set the latter via the new vhost-user operations 
>>>>>> FS_SET_STATE_FD,
>>>>>> FS_GET_STATE, and FS_SET_STATE.
>>>>>>
>>
>> [...]
>>
>>>>>>   static const VMStateDescription vuf_vmstate = {
>>>>>>       .name = "vhost-user-fs",
>>>>>> -    .unmigratable = 1,
>>>>>> +    .version_id = 1,
>>>>>> +    .fields = (VMStateField[]) {
>>>>>> +        VMSTATE_VIRTIO_DEVICE,
>>>>>> +        {
>>>>>> +            .name = "back-end",
>>>>>> +            .info = &(const VMStateInfo) {
>>>>>> +                .name = "virtio-fs back-end state",
>>>>>> +                .get = vuf_load_state,
>>>>>> +                .put = vuf_save_state,
>>>>>> +            },
>>>>>> +        },
>>>>>
>>>>> I've been working on stateless migration patch [1] and there was 
>>>>> discussed that we
>>>>> need to keep some kind of blocker by default if orchestrators rely 
>>>>> on unmigratable
>>>>> field in virtio-fs vmstate to block the migration.
>>>>> For this purpose I've implemented flag that selects "none" or 
>>>>> "external" and is checked
>>>>> in pre_save, so it could be extended with "internal" option.
>>>>> We didn't come to conclusion if we also need to check incoming 
>>>>> migration, the discussion
>>>>> has stopped for a while but I'm going back to it now.
>>>>>
>>>>> I would appreciate if you have time to take a look at the 
>>>>> discussion and consider the idea
>>>>> proposed there to store internal state as a subsection of vmstate 
>>>>> to make it as an option
>>>>> but not mandatory.
>>>>>
>>>>> [1] 
>>>>> https://patchew.org/QEMU/20230217170038.1273710-1-antonkuchin@yandex-team.ru/
>>>>
>>>> So far I’ve mostly considered these issues orthogonal.  If your 
>>>> stateless migration goes in first, then state is optional and I’ll 
>>>> adjust this series.
>>>> If stateful migration goes in first, then your series can simply 
>>>> make state optional by introducing the external option, no?
>>>
>>> Not really. State can be easily extended by subsections but not 
>>> trimmed. Maybe this can be worked around by defining two types of 
>>> vmstate and selecting the correct one at migration, but I'm not sure.
>>
>> I thought your patch included a switch on the vhost-user-fs device 
>> (on the qemu side) to tell qemu what migration data to expect. Can we 
>> not transfer a 0-length state for 'external', and assert this on the 
>> destination side?
>
> Looks like I really need to make the description of my patch and the 
> documentation more clear :) My patch proposes switch on _source_ side 
> to select which data to save in the stream mostly to protect old 
> orchestrators that don't expect virtio-fs to be migratable (and for 
> internal case it can be extended to select if qemu needs to request 
> state from backend), Michael insists that we also need to check on 
> destination but I disagree because I believe that we can figure this 
> out from stream data without additional device flags.
>
>>
>>>>
>>>> But maybe we could also consider making stateless migration a 
>>>> special case of stateful migration; if we had stateful migration, 
>>>> can’t we just implement stateless migration by telling virtiofsd 
>>>> that it should submit a special “I have no state” pseudo-state, 
>>>> i.e. by having a switch on virtiofsd instead?
>>>
>>> Sure. Backend can send empty state (as your patch treats 0 length as 
>>> a valid response and not error) or dummy state that can be 
>>> recognized as stateless. The only potential problem is that then we 
>>> need support in backend for new commands even to return dummy state, 
>>> and if backend can support both types then we'll need some switch in 
>>> backend to reply with real or empty state.
>>
>> Yes, exactly.
>>
>>>>
>>>> Off the top of my head, some downsides of that approach would be
>>>> (1) it’d need a switch on the virtiofsd side, not on the qemu side 
>>>> (not sure if that’s a downside, but a difference for sure),
>>>
>>> Why would you? It seems to me that this affects only how qemu treats 
>>> the vmstate of device. If the state was requested backend sends it 
>>> to qemu. If state subsection is present in stream qemu sends it to 
>>> the backend for loading. Stateless one just doesn't request state 
>>> from the backend. Or am I missing something?
>>>
>>>> and (2) we’d need at least some support for this on the virtiofsd 
>>>> side, i.e. practically can’t come quicker than stateful migration 
>>>> support.
>>>
>>> Not much, essentially this is just a reconnect. I've sent a draft of 
>>> a reconnect patch for old C-virtiofsd, for rust version it takes 
>>> much longer because I'm learning rust and I'm not really good at it 
>>> yet.
>>
>> I meant these two downsides not for your proposal, but instead if we 
>> implemented stateless migration only in the back-end; i.e. the 
>> front-end would only implement stateful migration, and the back-end 
>> would send and accept an empty state.
>>
>> Then, qemu would always request a “state” (even if it turns out empty 
>> for stateless migration), and qemu would always treat it the same, so 
>> there wouldn’t be a switch on the qemu side, but only on the 
>> virtiofsd side.  Doesn’t really matter, but what does matter is that 
>> we’d need to implement the migration interface in virtiofsd, even if 
>> in the end no state is transferred.
>>
>> So exactly what you’ve said above (“The only potential problem is 
>> […]”). :)
>>
>> Hanna
>>
>
> Oh, yes, we were talking about the same thing. So do you agree that 
> storing internal state data in subsection will allow backend code to 
> be more straightforward without additional switches?

Sounds good.  I think we should rename VHOST_USER_MIGRATION_TYPE_NONE to 
..._INTERNAL, though, and then use that (default) mode for stateful 
migration (via VMState), because I think that should be the default 
migration type (and while there’s no support for it, it will just 
continue to block migration).

So I suppose you mean something like this, where 
vuf_stateful_migration() basically returns `fs->migration_type == 
VHOST_USER_MIGRATION_TYPE_INTERNAL`, and on the destination, you’d check 
whether the subsection is present to decide which kind of migration it 
is, right?

static const VMStateDescription vuf_backend_vmstate;

static const VMStateDescription vuf_vmstate = {
     .name = "vhost-user-fs",
     .version_id = 0,
     .fields = (VMStateField[]) {
         VMSTATE_VIRTIO_DEVICE,
         VMSTATE_END_OF_LIST()
     },
     .subsections = (const VMStateDescription*[]) {
         &vuf_backend_vmstate,
         NULL,
     }
};

static const VMStateDescription vuf_backend_vmstate = {
     .name = "vhost-user-fs-backend",
     .version_id = 0,
     .needed = vuf_stateful_migration,
     .fields = (VMStateField[]) {
         {
             .name = "back-end",
             .info = &(const VMStateInfo) {
                 .name = "virtio-fs back-end state",
                 .get = vuf_load_state,
                 .put = vuf_save_state,
             },
         },
         VMSTATE_END_OF_LIST()
     },
};
Anton Kuchin March 22, 2023, 11:18 a.m. UTC | #7
On 21/03/2023 19:49, Hanna Czenczek wrote:
> On 20.03.23 13:39, Anton Kuchin wrote:
>> On 20/03/2023 11:33, Hanna Czenczek wrote:
>>> On 17.03.23 19:37, Anton Kuchin wrote:
>>>> On 17/03/2023 19:52, Hanna Czenczek wrote:
>>>>> On 17.03.23 18:19, Anton Kuchin wrote:
>>>>>> On 13/03/2023 19:48, Hanna Czenczek wrote:
>>>>>>> A virtio-fs device's VM state consists of:
>>>>>>> - the virtio device (vring) state (VMSTATE_VIRTIO_DEVICE)
>>>>>>> - the back-end's (virtiofsd's) internal state
>>>>>>>
>>>>>>> We get/set the latter via the new vhost-user operations 
>>>>>>> FS_SET_STATE_FD,
>>>>>>> FS_GET_STATE, and FS_SET_STATE.
>>>>>>>
>>>
>>> [...]
>>>
>>>>>>>   static const VMStateDescription vuf_vmstate = {
>>>>>>>       .name = "vhost-user-fs",
>>>>>>> -    .unmigratable = 1,
>>>>>>> +    .version_id = 1,
>>>>>>> +    .fields = (VMStateField[]) {
>>>>>>> +        VMSTATE_VIRTIO_DEVICE,
>>>>>>> +        {
>>>>>>> +            .name = "back-end",
>>>>>>> +            .info = &(const VMStateInfo) {
>>>>>>> +                .name = "virtio-fs back-end state",
>>>>>>> +                .get = vuf_load_state,
>>>>>>> +                .put = vuf_save_state,
>>>>>>> +            },
>>>>>>> +        },
>>>>>>
>>>>>> I've been working on stateless migration patch [1] and there was 
>>>>>> discussed that we
>>>>>> need to keep some kind of blocker by default if orchestrators 
>>>>>> rely on unmigratable
>>>>>> field in virtio-fs vmstate to block the migration.
>>>>>> For this purpose I've implemented flag that selects "none" or 
>>>>>> "external" and is checked
>>>>>> in pre_save, so it could be extended with "internal" option.
>>>>>> We didn't come to conclusion if we also need to check incoming 
>>>>>> migration, the discussion
>>>>>> has stopped for a while but I'm going back to it now.
>>>>>>
>>>>>> I would appreciate if you have time to take a look at the 
>>>>>> discussion and consider the idea
>>>>>> proposed there to store internal state as a subsection of vmstate 
>>>>>> to make it as an option
>>>>>> but not mandatory.
>>>>>>
>>>>>> [1] 
>>>>>> https://patchew.org/QEMU/20230217170038.1273710-1-antonkuchin@yandex-team.ru/
>>>>>
>>>>> So far I’ve mostly considered these issues orthogonal.  If your 
>>>>> stateless migration goes in first, then state is optional and I’ll 
>>>>> adjust this series.
>>>>> If stateful migration goes in first, then your series can simply 
>>>>> make state optional by introducing the external option, no?
>>>>
>>>> Not really. State can be easily extended by subsections but not 
>>>> trimmed. Maybe this can be worked around by defining two types of 
>>>> vmstate and selecting the correct one at migration, but I'm not sure.
>>>
>>> I thought your patch included a switch on the vhost-user-fs device 
>>> (on the qemu side) to tell qemu what migration data to expect. Can 
>>> we not transfer a 0-length state for 'external', and assert this on 
>>> the destination side?
>>
>> Looks like I really need to make the description of my patch and the 
>> documentation more clear :) My patch proposes switch on _source_ side 
>> to select which data to save in the stream mostly to protect old 
>> orchestrators that don't expect virtio-fs to be migratable (and for 
>> internal case it can be extended to select if qemu needs to request 
>> state from backend), Michael insists that we also need to check on 
>> destination but I disagree because I believe that we can figure this 
>> out from stream data without additional device flags.
>>
>>>
>>>>>
>>>>> But maybe we could also consider making stateless migration a 
>>>>> special case of stateful migration; if we had stateful migration, 
>>>>> can’t we just implement stateless migration by telling virtiofsd 
>>>>> that it should submit a special “I have no state” pseudo-state, 
>>>>> i.e. by having a switch on virtiofsd instead?
>>>>
>>>> Sure. Backend can send empty state (as your patch treats 0 length 
>>>> as a valid response and not error) or dummy state that can be 
>>>> recognized as stateless. The only potential problem is that then we 
>>>> need support in backend for new commands even to return dummy 
>>>> state, and if backend can support both types then we'll need some 
>>>> switch in backend to reply with real or empty state.
>>>
>>> Yes, exactly.
>>>
>>>>>
>>>>> Off the top of my head, some downsides of that approach would be
>>>>> (1) it’d need a switch on the virtiofsd side, not on the qemu side 
>>>>> (not sure if that’s a downside, but a difference for sure),
>>>>
>>>> Why would you? It seems to me that this affects only how qemu 
>>>> treats the vmstate of device. If the state was requested backend 
>>>> sends it to qemu. If state subsection is present in stream qemu 
>>>> sends it to the backend for loading. Stateless one just doesn't 
>>>> request state from the backend. Or am I missing something?
>>>>
>>>>> and (2) we’d need at least some support for this on the virtiofsd 
>>>>> side, i.e. practically can’t come quicker than stateful migration 
>>>>> support.
>>>>
>>>> Not much, essentially this is just a reconnect. I've sent a draft 
>>>> of a reconnect patch for old C-virtiofsd, for rust version it takes 
>>>> much longer because I'm learning rust and I'm not really good at it 
>>>> yet.
>>>
>>> I meant these two downsides not for your proposal, but instead if we 
>>> implemented stateless migration only in the back-end; i.e. the 
>>> front-end would only implement stateful migration, and the back-end 
>>> would send and accept an empty state.
>>>
>>> Then, qemu would always request a “state” (even if it turns out 
>>> empty for stateless migration), and qemu would always treat it the 
>>> same, so there wouldn’t be a switch on the qemu side, but only on 
>>> the virtiofsd side.  Doesn’t really matter, but what does matter is 
>>> that we’d need to implement the migration interface in virtiofsd, 
>>> even if in the end no state is transferred.
>>>
>>> So exactly what you’ve said above (“The only potential problem is 
>>> […]”). :)
>>>
>>> Hanna
>>>
>>
>> Oh, yes, we were talking about the same thing. So do you agree that 
>> storing internal state data in subsection will allow backend code to 
>> be more straightforward without additional switches?
>
> Sounds good.  I think we should rename VHOST_USER_MIGRATION_TYPE_NONE 
> to ..._INTERNAL, though, and then use that (default) mode for stateful 
> migration (via VMState), because I think that should be the default 
> migration type (and while there’s no support for it, it will just 
> continue to block migration).

My plan was to add new ..._INTERNAL option and keep ..._NONE as default 
one that blocks migration unless orchestrator explicitly selects 
migration type to avoid accidental migrations of old orchestrators that 
expect blocker for virtio-fs. But if INTERNAL blocks migration when 
backend doesn't support new commands your idea to make it default may be 
even better.

>
>
> So I suppose you mean something like this, where 
> vuf_stateful_migration() basically returns `fs->migration_type == 
> VHOST_USER_MIGRATION_TYPE_INTERNAL`, and on the destination, you’d 
> check whether the subsection is present to decide which kind of 
> migration it is, right?

Yes, exactly.

>
>
> static const VMStateDescription vuf_backend_vmstate;
>
> static const VMStateDescription vuf_vmstate = {
>     .name = "vhost-user-fs",
>     .version_id = 0,
>     .fields = (VMStateField[]) {
>         VMSTATE_VIRTIO_DEVICE,
>         VMSTATE_END_OF_LIST()
>     },
>     .subsections = (const VMStateDescription*[]) {
>         &vuf_backend_vmstate,
>         NULL,
>     }
> };
>
> static const VMStateDescription vuf_backend_vmstate = {
>     .name = "vhost-user-fs-backend",
>     .version_id = 0,
>     .needed = vuf_stateful_migration,
>     .fields = (VMStateField[]) {
>         {
>             .name = "back-end",
>             .info = &(const VMStateInfo) {
>                 .name = "virtio-fs back-end state",
>                 .get = vuf_load_state,
>                 .put = vuf_save_state,
>             },
>         },
>         VMSTATE_END_OF_LIST()
>     },
> };
>
diff mbox series

Patch

diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index 83fc20e49e..df1fb02acc 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -20,8 +20,10 @@ 
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
 #include "qemu/error-report.h"
+#include "qemu/memfd.h"
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/vhost-user-fs.h"
+#include "migration/qemu-file-types.h"
 #include "monitor/monitor.h"
 #include "sysemu/sysemu.h"
 
@@ -298,9 +300,176 @@  static struct vhost_dev *vuf_get_vhost(VirtIODevice *vdev)
     return &fs->vhost_dev;
 }
 
+/**
+ * Fetch the internal state from the back-end (virtiofsd) and save it
+ * to `f`.
+ */
+static int vuf_save_state(QEMUFile *f, void *pv, size_t size,
+                          const VMStateField *field, JSONWriter *vmdesc)
+{
+    VirtIODevice *vdev = pv;
+    VHostUserFS *fs = VHOST_USER_FS(vdev);
+    int memfd = -1;
+    /* Size of the shared memory through which to transfer the state */
+    const size_t chunk_size = 4 * 1024 * 1024;
+    size_t state_offset;
+    ssize_t remaining;
+    void *shm_buf;
+    Error *local_err = NULL;
+    int ret, ret2;
+
+    /* Set up shared memory through which to receive the state from virtiofsd */
+    shm_buf = qemu_memfd_alloc("vhost-fs-state", chunk_size,
+                               F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW,
+                               &memfd, &local_err);
+    if (!shm_buf) {
+        error_report_err(local_err);
+        ret = -ENOMEM;
+        goto early_fail;
+    }
+
+    /* Share the SHM area with virtiofsd */
+    ret = vhost_fs_set_state_fd(&fs->vhost_dev, memfd, chunk_size);
+    if (ret < 0) {
+        goto early_fail;
+    }
+
+    /* Receive the virtiofsd state in chunks, and write them to `f` */
+    state_offset = 0;
+    do {
+        size_t this_chunk_size;
+
+        remaining = vhost_fs_get_state(&fs->vhost_dev, state_offset,
+                                       chunk_size);
+        if (remaining < 0) {
+            ret = remaining;
+            goto fail;
+        }
+
+        /* Prefix the whole state by its total length */
+        if (state_offset == 0) {
+            qemu_put_be64(f, remaining);
+        }
+
+        this_chunk_size = MIN(remaining, chunk_size);
+        qemu_put_buffer(f, shm_buf, this_chunk_size);
+        state_offset += this_chunk_size;
+    } while (remaining >= chunk_size);
+
+    ret = 0;
+fail:
+    /* Have virtiofsd close the shared memory */
+    ret2 = vhost_fs_set_state_fd(&fs->vhost_dev, -1, 0);
+    if (ret2 < 0) {
+        error_report("Failed to remove state FD from the vhost-user-fs back "
+                     "end: %s", strerror(-ret));
+        if (ret == 0) {
+            ret = ret2;
+        }
+    }
+
+early_fail:
+    if (shm_buf) {
+        qemu_memfd_free(shm_buf, chunk_size, memfd);
+    }
+
+    return ret;
+}
+
+/**
+ * Load the back-end's (virtiofsd's) internal state from `f` and send
+ * it over to that back-end.
+ */
+static int vuf_load_state(QEMUFile *f, void *pv, size_t size,
+                          const VMStateField *field)
+{
+    VirtIODevice *vdev = pv;
+    VHostUserFS *fs = VHOST_USER_FS(vdev);
+    int memfd = -1;
+    /* Size of the shared memory through which to transfer the state */
+    const size_t chunk_size = 4 * 1024 * 1024;
+    size_t state_offset;
+    uint64_t remaining;
+    void *shm_buf;
+    Error *local_err = NULL;
+    int ret, ret2;
+
+    /* The state is prefixed by its total length, read that first */
+    remaining = qemu_get_be64(f);
+
+    /* Set up shared memory through which to send the state to virtiofsd */
+    shm_buf = qemu_memfd_alloc("vhost-fs-state", chunk_size,
+                               F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW,
+                               &memfd, &local_err);
+    if (!shm_buf) {
+        error_report_err(local_err);
+        ret = -ENOMEM;
+        goto early_fail;
+    }
+
+    /* Share the SHM area with virtiofsd */
+    ret = vhost_fs_set_state_fd(&fs->vhost_dev, memfd, chunk_size);
+    if (ret < 0) {
+        goto early_fail;
+    }
+
+    /*
+     * Read the virtiofsd state in chunks from `f`, and send them over
+     * to virtiofsd
+     */
+    state_offset = 0;
+    do {
+        size_t this_chunk_size = MIN(remaining, chunk_size);
+
+        if (qemu_get_buffer(f, shm_buf, this_chunk_size) < this_chunk_size) {
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        ret = vhost_fs_set_state(&fs->vhost_dev, state_offset, this_chunk_size);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        state_offset += this_chunk_size;
+        remaining -= this_chunk_size;
+    } while (remaining > 0);
+
+    ret = 0;
+fail:
+    ret2 = vhost_fs_set_state_fd(&fs->vhost_dev, -1, 0);
+    if (ret2 < 0) {
+        error_report("Failed to remove state FD from the vhost-user-fs back "
+                     "end -- perhaps it failed to deserialize/apply the state: "
+                     "%s", strerror(-ret2));
+        if (ret == 0) {
+            ret = ret2;
+        }
+    }
+
+early_fail:
+    if (shm_buf) {
+        qemu_memfd_free(shm_buf, chunk_size, memfd);
+    }
+
+    return ret;
+}
+
 static const VMStateDescription vuf_vmstate = {
     .name = "vhost-user-fs",
-    .unmigratable = 1,
+    .version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_VIRTIO_DEVICE,
+        {
+            .name = "back-end",
+            .info = &(const VMStateInfo) {
+                .name = "virtio-fs back-end state",
+                .get = vuf_load_state,
+                .put = vuf_save_state,
+            },
+        },
+        VMSTATE_END_OF_LIST()
+    },
 };
 
 static Property vuf_properties[] = {