diff mbox series

[v1,2/2] hw/pvrdma: add live migration support

Message ID 20190828142328.24561-3-skrtbhtngr@gmail.com
State New
Headers show
Series Add live migration support in the PVRDMA device | expand

Commit Message

Sukrit Bhatnagar Aug. 28, 2019, 2:23 p.m. UTC
vmstate_pvrdma describes the PCI and MSIX states as well as the dma
address for dsr and the gid table of device.
vmstate_pvrdma_gids describes each gid in the gid table.

pvrdma_post_save() does the job of unregistering gid entries from the
backend device in the source host.

pvrdma_post_load() maps to dsr using the loaded dma address, registers
each loaded gid into the backend device, and finally calls load_dsr()
to perform other mappings and ring init operations.

Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Cc: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
---
 hw/rdma/vmw/pvrdma_main.c | 77 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

Comments

Yuval Shaia Aug. 29, 2019, 12:53 p.m. UTC | #1
On Wed, Aug 28, 2019 at 07:53:28PM +0530, Sukrit Bhatnagar wrote:
> vmstate_pvrdma describes the PCI and MSIX states as well as the dma
> address for dsr and the gid table of device.
> vmstate_pvrdma_gids describes each gid in the gid table.
> 
> pvrdma_post_save() does the job of unregistering gid entries from the
> backend device in the source host.
> 
> pvrdma_post_load() maps to dsr using the loaded dma address, registers
> each loaded gid into the backend device, and finally calls load_dsr()
> to perform other mappings and ring init operations.

I think it is worth mentioning that the dma address is kept in driver/device
shared memory (dsr->dma), which is migrated as part of memory migration and
is out of the scope of this change, and so we do not need to save/load
the dma address during migration.

Also, you should specifically comment that this migration support does not
include QP migration. This means that live migration *during*
traffic is not yet supported.

> 
> Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> Cc: Yuval Shaia <yuval.shaia@oracle.com>
> Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
> ---
>  hw/rdma/vmw/pvrdma_main.c | 77 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 77 insertions(+)
> 
> diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> index 6c90db96f9..6f8b56dea3 100644
> --- a/hw/rdma/vmw/pvrdma_main.c
> +++ b/hw/rdma/vmw/pvrdma_main.c
> @@ -28,6 +28,7 @@
>  #include "sysemu/sysemu.h"
>  #include "monitor/monitor.h"
>  #include "hw/rdma/rdma.h"
> +#include "migration/register.h"
>  
>  #include "../rdma_rm.h"
>  #include "../rdma_backend.h"
> @@ -593,6 +594,81 @@ static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
>      pvrdma_fini(pci_dev);
>  }
>  
> +static int pvrdma_post_save(void *opaque)
> +{
> +    int i, rc;
> +    PVRDMADev *dev = opaque;
> +
> +    for (i = 0; i < MAX_GIDS; i++) {
> +

Empty line is redundant here.

> +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> +            continue;
> +        }
> +        rc = rdma_backend_del_gid(&dev->backend_dev,
> +                                   dev->backend_eth_device_name,
> +                                   &dev->rdma_dev_res.port.gid_tbl[i].gid);
> +        if (rc) {
> +            return -EINVAL;

Some error report would help here, I guess.

> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int pvrdma_post_load(void *opaque, int version_id)
> +{
> +    int i, rc;
> +    PVRDMADev *dev = opaque;
> +    PCIDevice *pci_dev = PCI_DEVICE(dev);
> +    DSRInfo *dsr_info = &dev->dsr_info;
> +
> +    dsr_info->dsr = rdma_pci_dma_map(pci_dev, dsr_info->dma,
> +                                sizeof(struct pvrdma_device_shared_region));
> +    if (!dsr_info->dsr) {
> +        rdma_error_report("Failed to map to DSR");
> +        return -ENOMEM;
> +    }
> +
> +    for (i = 0; i < MAX_GIDS; i++) {
> +

Empty line is redundant here.

> +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> +            continue;
> +        }
> +
> +        rc = rdma_backend_add_gid(&dev->backend_dev,
> +                                  dev->backend_eth_device_name,
> +                                  &dev->rdma_dev_res.port.gid_tbl[i].gid);
> +        if (rc) {
> +            return -EINVAL;
> +        }
> +    }
> +
> +    return load_dsr(dev);
> +}
> +
> +static const VMStateDescription vmstate_pvrdma_gids = {
> +    .name = "pvrdma-gids",
> +    .fields = (VMStateField[]) {
> +            VMSTATE_UINT8_ARRAY_V(gid.raw, RdmaRmGid, 16, 0),
> +            VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static const VMStateDescription vmstate_pvrdma = {
> +    .name = PVRDMA_HW_NAME,
> +    .post_save = pvrdma_post_save,
> +    .post_load = pvrdma_post_load,
> +    .fields = (VMStateField[]) {
> +            VMSTATE_PCI_DEVICE(parent_obj, PVRDMADev),
> +            VMSTATE_MSIX(parent_obj, PVRDMADev),
> +            VMSTATE_UINT64(dsr_info.dma, PVRDMADev),
> +            VMSTATE_STRUCT_ARRAY(rdma_dev_res.port.gid_tbl, PVRDMADev,
> +                                 MAX_PORT_GIDS, 0, vmstate_pvrdma_gids,
> +                                 RdmaRmGid),
> +            VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static void pvrdma_realize(PCIDevice *pdev, Error **errp)
>  {
>      int rc = 0;
> @@ -688,6 +764,7 @@ static void pvrdma_class_init(ObjectClass *klass, void *data)
>  
>      dc->desc = "RDMA Device";
>      dc->props = pvrdma_dev_properties;
> +    dc->vmsd = &vmstate_pvrdma;
>      set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
>  
>      ir->print_statistics = pvrdma_print_statistics;
> -- 
> 2.21.0
> 
>
Yuval Shaia Aug. 29, 2019, 12:56 p.m. UTC | #2
On Wed, Aug 28, 2019 at 07:53:28PM +0530, Sukrit Bhatnagar wrote:
> vmstate_pvrdma describes the PCI and MSIX states as well as the dma
> address for dsr and the gid table of device.
> vmstate_pvrdma_gids describes each gid in the gid table.
> 
> pvrdma_post_save() does the job of unregistering gid entries from the
> backend device in the source host.
> 
> pvrdma_post_load() maps to dsr using the loaded dma address, registers
> each loaded gid into the backend device, and finally calls load_dsr()
> to perform other mappings and ring init operations.
> 
> Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> Cc: Yuval Shaia <yuval.shaia@oracle.com>
> Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
> ---
>  hw/rdma/vmw/pvrdma_main.c | 77 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 77 insertions(+)
> 
> diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> index 6c90db96f9..6f8b56dea3 100644
> --- a/hw/rdma/vmw/pvrdma_main.c
> +++ b/hw/rdma/vmw/pvrdma_main.c
> @@ -28,6 +28,7 @@
>  #include "sysemu/sysemu.h"
>  #include "monitor/monitor.h"
>  #include "hw/rdma/rdma.h"
> +#include "migration/register.h"
>  
>  #include "../rdma_rm.h"
>  #include "../rdma_backend.h"
> @@ -593,6 +594,81 @@ static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
>      pvrdma_fini(pci_dev);
>  }
>  
> +static int pvrdma_post_save(void *opaque)
> +{
> +    int i, rc;
> +    PVRDMADev *dev = opaque;
> +
> +    for (i = 0; i < MAX_GIDS; i++) {
> +
> +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> +            continue;
> +        }
> +        rc = rdma_backend_del_gid(&dev->backend_dev,
> +                                   dev->backend_eth_device_name,
> +                                   &dev->rdma_dev_res.port.gid_tbl[i].gid);
> +        if (rc) {
> +            return -EINVAL;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int pvrdma_post_load(void *opaque, int version_id)
> +{
> +    int i, rc;
> +    PVRDMADev *dev = opaque;
> +    PCIDevice *pci_dev = PCI_DEVICE(dev);
> +    DSRInfo *dsr_info = &dev->dsr_info;
> +
> +    dsr_info->dsr = rdma_pci_dma_map(pci_dev, dsr_info->dma,
> +                                sizeof(struct pvrdma_device_shared_region));
> +    if (!dsr_info->dsr) {
> +        rdma_error_report("Failed to map to DSR");
> +        return -ENOMEM;
> +    }
> +
> +    for (i = 0; i < MAX_GIDS; i++) {
> +
> +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> +            continue;
> +        }
> +
> +        rc = rdma_backend_add_gid(&dev->backend_dev,
> +                                  dev->backend_eth_device_name,
> +                                  &dev->rdma_dev_res.port.gid_tbl[i].gid);
> +        if (rc) {
> +            return -EINVAL;
> +        }
> +    }
> +
> +    return load_dsr(dev);

This check is better performed before any gid manipulation on the host
because no one will undo it if load_dsr fails.

> +}
> +
> +static const VMStateDescription vmstate_pvrdma_gids = {
> +    .name = "pvrdma-gids",
> +    .fields = (VMStateField[]) {
> +            VMSTATE_UINT8_ARRAY_V(gid.raw, RdmaRmGid, 16, 0),
> +            VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static const VMStateDescription vmstate_pvrdma = {
> +    .name = PVRDMA_HW_NAME,
> +    .post_save = pvrdma_post_save,
> +    .post_load = pvrdma_post_load,
> +    .fields = (VMStateField[]) {
> +            VMSTATE_PCI_DEVICE(parent_obj, PVRDMADev),
> +            VMSTATE_MSIX(parent_obj, PVRDMADev),
> +            VMSTATE_UINT64(dsr_info.dma, PVRDMADev),
> +            VMSTATE_STRUCT_ARRAY(rdma_dev_res.port.gid_tbl, PVRDMADev,
> +                                 MAX_PORT_GIDS, 0, vmstate_pvrdma_gids,
> +                                 RdmaRmGid),
> +            VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static void pvrdma_realize(PCIDevice *pdev, Error **errp)
>  {
>      int rc = 0;
> @@ -688,6 +764,7 @@ static void pvrdma_class_init(ObjectClass *klass, void *data)
>  
>      dc->desc = "RDMA Device";
>      dc->props = pvrdma_dev_properties;
> +    dc->vmsd = &vmstate_pvrdma;
>      set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
>  
>      ir->print_statistics = pvrdma_print_statistics;
> -- 
> 2.21.0
> 
>
Marcel Apfelbaum Aug. 31, 2019, 7:45 p.m. UTC | #3
On 8/28/19 5:23 PM, Sukrit Bhatnagar wrote:
> vmstate_pvrdma describes the PCI and MSIX states as well as the dma
> address for dsr and the gid table of device.
> vmstate_pvrdma_gids describes each gid in the gid table.
>
> pvrdma_post_save() does the job of unregistering gid entries from the
> backend device in the source host.
>
> pvrdma_post_load() maps to dsr using the loaded dma address, registers
> each loaded gid into the backend device, and finally calls load_dsr()
> to perform other mappings and ring init operations.
>
> Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> Cc: Yuval Shaia <yuval.shaia@oracle.com>
> Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
> ---
>   hw/rdma/vmw/pvrdma_main.c | 77 +++++++++++++++++++++++++++++++++++++++
>   1 file changed, 77 insertions(+)
>
> diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> index 6c90db96f9..6f8b56dea3 100644
> --- a/hw/rdma/vmw/pvrdma_main.c
> +++ b/hw/rdma/vmw/pvrdma_main.c
> @@ -28,6 +28,7 @@
>   #include "sysemu/sysemu.h"
>   #include "monitor/monitor.h"
>   #include "hw/rdma/rdma.h"
> +#include "migration/register.h"
>   
>   #include "../rdma_rm.h"
>   #include "../rdma_backend.h"
> @@ -593,6 +594,81 @@ static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
>       pvrdma_fini(pci_dev);
>   }
>   
> +static int pvrdma_post_save(void *opaque)
> +{
> +    int i, rc;
> +    PVRDMADev *dev = opaque;
> +
> +    for (i = 0; i < MAX_GIDS; i++) {
> +

No need for the extra line
> +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> +            continue;
> +        }
> +        rc = rdma_backend_del_gid(&dev->backend_dev,
> +                                   dev->backend_eth_device_name,
> +                                   &dev->rdma_dev_res.port.gid_tbl[i].gid);
> +        if (rc) {
> +            return -EINVAL;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int pvrdma_post_load(void *opaque, int version_id)
> +{
> +    int i, rc;
> +    PVRDMADev *dev = opaque;
> +    PCIDevice *pci_dev = PCI_DEVICE(dev);
> +    DSRInfo *dsr_info = &dev->dsr_info;
> +
> +    dsr_info->dsr = rdma_pci_dma_map(pci_dev, dsr_info->dma,
> +                                sizeof(struct pvrdma_device_shared_region));
> +    if (!dsr_info->dsr) {
> +        rdma_error_report("Failed to map to DSR");
> +        return -ENOMEM;
> +    }
> +
> +    for (i = 0; i < MAX_GIDS; i++) {
> +

The same here

> +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> +            continue;
> +        }
> +
> +        rc = rdma_backend_add_gid(&dev->backend_dev,
> +                                  dev->backend_eth_device_name,
> +                                  &dev->rdma_dev_res.port.gid_tbl[i].gid);
> +        if (rc) {
> +            return -EINVAL;
> +        }
> +    }
> +
> +    return load_dsr(dev);
> +}
> +
> +static const VMStateDescription vmstate_pvrdma_gids = {
> +    .name = "pvrdma-gids",
> +    .fields = (VMStateField[]) {
> +            VMSTATE_UINT8_ARRAY_V(gid.raw, RdmaRmGid, 16, 0),

Is 16 the array length? If yes, do we have some macro definition for it?

> +            VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static const VMStateDescription vmstate_pvrdma = {
> +    .name = PVRDMA_HW_NAME,
> +    .post_save = pvrdma_post_save,
> +    .post_load = pvrdma_post_load,
> +    .fields = (VMStateField[]) {
> +            VMSTATE_PCI_DEVICE(parent_obj, PVRDMADev),
> +            VMSTATE_MSIX(parent_obj, PVRDMADev),
> +            VMSTATE_UINT64(dsr_info.dma, PVRDMADev),
> +            VMSTATE_STRUCT_ARRAY(rdma_dev_res.port.gid_tbl, PVRDMADev,
> +                                 MAX_PORT_GIDS, 0, vmstate_pvrdma_gids,
> +                                 RdmaRmGid),
> +            VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>   static void pvrdma_realize(PCIDevice *pdev, Error **errp)
>   {
>       int rc = 0;
> @@ -688,6 +764,7 @@ static void pvrdma_class_init(ObjectClass *klass, void *data)
>   
>       dc->desc = "RDMA Device";
>       dc->props = pvrdma_dev_properties;
> +    dc->vmsd = &vmstate_pvrdma;
>       set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
>   
>       ir->print_statistics = pvrdma_print_statistics;

Very simple and elegant.
If I understand correctly the live migration of a pvrdma device with no
active workloads works with this patch, right?
If yes, I think we should consider merging this code already.
Yuval, do you agree?

Thanks,
Marcel
Yuval Shaia Sept. 1, 2019, 9:35 a.m. UTC | #4
On Sat, Aug 31, 2019 at 10:45:44PM +0300, Marcel Apfelbaum wrote:
> 
> 
> On 8/28/19 5:23 PM, Sukrit Bhatnagar wrote:
> > vmstate_pvrdma describes the PCI and MSIX states as well as the dma
> > address for dsr and the gid table of device.
> > vmstate_pvrdma_gids describes each gid in the gid table.
> > 
> > pvrdma_post_save() does the job of unregistering gid entries from the
> > backend device in the source host.
> > 
> > pvrdma_post_load() maps to dsr using the loaded dma address, registers
> > each loaded gid into the backend device, and finally calls load_dsr()
> > to perform other mappings and ring init operations.
> > 
> > Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> > Cc: Yuval Shaia <yuval.shaia@oracle.com>
> > Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
> > ---
> >   hw/rdma/vmw/pvrdma_main.c | 77 +++++++++++++++++++++++++++++++++++++++
> >   1 file changed, 77 insertions(+)
> > 
> > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> > index 6c90db96f9..6f8b56dea3 100644
> > --- a/hw/rdma/vmw/pvrdma_main.c
> > +++ b/hw/rdma/vmw/pvrdma_main.c
> > @@ -28,6 +28,7 @@
> >   #include "sysemu/sysemu.h"
> >   #include "monitor/monitor.h"
> >   #include "hw/rdma/rdma.h"
> > +#include "migration/register.h"
> >   #include "../rdma_rm.h"
> >   #include "../rdma_backend.h"
> > @@ -593,6 +594,81 @@ static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
> >       pvrdma_fini(pci_dev);
> >   }
> > +static int pvrdma_post_save(void *opaque)
> > +{
> > +    int i, rc;
> > +    PVRDMADev *dev = opaque;
> > +
> > +    for (i = 0; i < MAX_GIDS; i++) {
> > +
> 
> No need for the extra line
> > +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> > +            continue;
> > +        }
> > +        rc = rdma_backend_del_gid(&dev->backend_dev,
> > +                                   dev->backend_eth_device_name,
> > +                                   &dev->rdma_dev_res.port.gid_tbl[i].gid);
> > +        if (rc) {
> > +            return -EINVAL;
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int pvrdma_post_load(void *opaque, int version_id)
> > +{
> > +    int i, rc;
> > +    PVRDMADev *dev = opaque;
> > +    PCIDevice *pci_dev = PCI_DEVICE(dev);
> > +    DSRInfo *dsr_info = &dev->dsr_info;
> > +
> > +    dsr_info->dsr = rdma_pci_dma_map(pci_dev, dsr_info->dma,
> > +                                sizeof(struct pvrdma_device_shared_region));
> > +    if (!dsr_info->dsr) {
> > +        rdma_error_report("Failed to map to DSR");
> > +        return -ENOMEM;
> > +    }
> > +
> > +    for (i = 0; i < MAX_GIDS; i++) {
> > +
> 
> The same here
> 
> > +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> > +            continue;
> > +        }
> > +
> > +        rc = rdma_backend_add_gid(&dev->backend_dev,
> > +                                  dev->backend_eth_device_name,
> > +                                  &dev->rdma_dev_res.port.gid_tbl[i].gid);
> > +        if (rc) {
> > +            return -EINVAL;
> > +        }
> > +    }
> > +
> > +    return load_dsr(dev);
> > +}
> > +
> > +static const VMStateDescription vmstate_pvrdma_gids = {
> > +    .name = "pvrdma-gids",
> > +    .fields = (VMStateField[]) {
> > +            VMSTATE_UINT8_ARRAY_V(gid.raw, RdmaRmGid, 16, 0),
> 
> Is 16 the array length? If yes, do we have same macro definition?
> 
> > +            VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static const VMStateDescription vmstate_pvrdma = {
> > +    .name = PVRDMA_HW_NAME,
> > +    .post_save = pvrdma_post_save,
> > +    .post_load = pvrdma_post_load,
> > +    .fields = (VMStateField[]) {
> > +            VMSTATE_PCI_DEVICE(parent_obj, PVRDMADev),
> > +            VMSTATE_MSIX(parent_obj, PVRDMADev),
> > +            VMSTATE_UINT64(dsr_info.dma, PVRDMADev),
> > +            VMSTATE_STRUCT_ARRAY(rdma_dev_res.port.gid_tbl, PVRDMADev,
> > +                                 MAX_PORT_GIDS, 0, vmstate_pvrdma_gids,
> > +                                 RdmaRmGid),
> > +            VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> >   static void pvrdma_realize(PCIDevice *pdev, Error **errp)
> >   {
> >       int rc = 0;
> > @@ -688,6 +764,7 @@ static void pvrdma_class_init(ObjectClass *klass, void *data)
> >       dc->desc = "RDMA Device";
> >       dc->props = pvrdma_dev_properties;
> > +    dc->vmsd = &vmstate_pvrdma;
> >       set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
> >       ir->print_statistics = pvrdma_print_statistics;
> 
> Very simple an elegant.
> If I understand correctly the live migration of a pvrdma device with no
> active workloads works with this patch, right?

And no QPs also.

> If yes, I think we should consider merging this code already.
> Yuval, do you agree?

Sure I do!
Even with the limitation, this is a huge enhancement that can be used right
away.

But I first suggested some fixes, so let's see v2.

> 
> Thanks,
> Marcel
> 
>
Sukrit Bhatnagar Sept. 3, 2019, 11:05 a.m. UTC | #5
On Sun, 1 Sep 2019 at 01:15, Marcel Apfelbaum
<marcel.apfelbaum@gmail.com> wrote:
>
>
>
> On 8/28/19 5:23 PM, Sukrit Bhatnagar wrote:
> > vmstate_pvrdma describes the PCI and MSIX states as well as the dma
> > address for dsr and the gid table of device.
> > vmstate_pvrdma_gids describes each gid in the gid table.
> >
> > pvrdma_post_save() does the job of unregistering gid entries from the
> > backend device in the source host.
> >
> > pvrdma_post_load() maps to dsr using the loaded dma address, registers
> > each loaded gid into the backend device, and finally calls load_dsr()
> > to perform other mappings and ring init operations.
> >
> > Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> > Cc: Yuval Shaia <yuval.shaia@oracle.com>
> > Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
> > ---
> >   hw/rdma/vmw/pvrdma_main.c | 77 +++++++++++++++++++++++++++++++++++++++
> >   1 file changed, 77 insertions(+)
> >
> > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> > index 6c90db96f9..6f8b56dea3 100644
> > --- a/hw/rdma/vmw/pvrdma_main.c
> > +++ b/hw/rdma/vmw/pvrdma_main.c
> > @@ -28,6 +28,7 @@
> >   #include "sysemu/sysemu.h"
> >   #include "monitor/monitor.h"
> >   #include "hw/rdma/rdma.h"
> > +#include "migration/register.h"
> >
> >   #include "../rdma_rm.h"
> >   #include "../rdma_backend.h"
> > @@ -593,6 +594,81 @@ static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
> >       pvrdma_fini(pci_dev);
> >   }
> >
> > +static int pvrdma_post_save(void *opaque)
> > +{
> > +    int i, rc;
> > +    PVRDMADev *dev = opaque;
> > +
> > +    for (i = 0; i < MAX_GIDS; i++) {
> > +
>
> No need for the extra line
> > +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> > +            continue;
> > +        }
> > +        rc = rdma_backend_del_gid(&dev->backend_dev,
> > +                                   dev->backend_eth_device_name,
> > +                                   &dev->rdma_dev_res.port.gid_tbl[i].gid);
> > +        if (rc) {
> > +            return -EINVAL;
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int pvrdma_post_load(void *opaque, int version_id)
> > +{
> > +    int i, rc;
> > +    PVRDMADev *dev = opaque;
> > +    PCIDevice *pci_dev = PCI_DEVICE(dev);
> > +    DSRInfo *dsr_info = &dev->dsr_info;
> > +
> > +    dsr_info->dsr = rdma_pci_dma_map(pci_dev, dsr_info->dma,
> > +                                sizeof(struct pvrdma_device_shared_region));
> > +    if (!dsr_info->dsr) {
> > +        rdma_error_report("Failed to map to DSR");
> > +        return -ENOMEM;
> > +    }
> > +
> > +    for (i = 0; i < MAX_GIDS; i++) {
> > +
>
> The same here
>
> > +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> > +            continue;
> > +        }
> > +
> > +        rc = rdma_backend_add_gid(&dev->backend_dev,
> > +                                  dev->backend_eth_device_name,
> > +                                  &dev->rdma_dev_res.port.gid_tbl[i].gid);
> > +        if (rc) {
> > +            return -EINVAL;
> > +        }
> > +    }
> > +
> > +    return load_dsr(dev);
> > +}
> > +
> > +static const VMStateDescription vmstate_pvrdma_gids = {
> > +    .name = "pvrdma-gids",
> > +    .fields = (VMStateField[]) {
> > +            VMSTATE_UINT8_ARRAY_V(gid.raw, RdmaRmGid, 16, 0),
>
> Is 16 the array length? If yes, do we have same macro definition?

16 here represents the number of bytes in a GID.
This comes from the verbs definition of ibv_gid

union ibv_gid {
    uint8_t         raw[16];
    struct {
        __be64  subnet_prefix;
        __be64  interface_id;
    } global;
};

I suppose there is no macro for this but we can declare
our own (something like IBV_GID_SIZE).

> > +            VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static const VMStateDescription vmstate_pvrdma = {
> > +    .name = PVRDMA_HW_NAME,
> > +    .post_save = pvrdma_post_save,
> > +    .post_load = pvrdma_post_load,
> > +    .fields = (VMStateField[]) {
> > +            VMSTATE_PCI_DEVICE(parent_obj, PVRDMADev),
> > +            VMSTATE_MSIX(parent_obj, PVRDMADev),
> > +            VMSTATE_UINT64(dsr_info.dma, PVRDMADev),
> > +            VMSTATE_STRUCT_ARRAY(rdma_dev_res.port.gid_tbl, PVRDMADev,
> > +                                 MAX_PORT_GIDS, 0, vmstate_pvrdma_gids,
> > +                                 RdmaRmGid),
> > +            VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> >   static void pvrdma_realize(PCIDevice *pdev, Error **errp)
> >   {
> >       int rc = 0;
> > @@ -688,6 +764,7 @@ static void pvrdma_class_init(ObjectClass *klass, void *data)
> >
> >       dc->desc = "RDMA Device";
> >       dc->props = pvrdma_dev_properties;
> > +    dc->vmsd = &vmstate_pvrdma;
> >       set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
> >
> >       ir->print_statistics = pvrdma_print_statistics;
>
> Very simple an elegant.
> If I understand correctly the live migration of a pvrdma device with no
> active workloads works with this patch, right?

Yes.

> If yes, I think we should consider merging this code already.
> Yuval, do you agree?
>
> Thanks,
> Marcel
>
>
Sukrit Bhatnagar Sept. 3, 2019, 9:33 p.m. UTC | #6
On Thu, 29 Aug 2019 at 18:23, Yuval Shaia <yuval.shaia@oracle.com> wrote:
>
> On Wed, Aug 28, 2019 at 07:53:28PM +0530, Sukrit Bhatnagar wrote:
> > vmstate_pvrdma describes the PCI and MSIX states as well as the dma
> > address for dsr and the gid table of device.
> > vmstate_pvrdma_gids describes each gid in the gid table.
> >
> > pvrdma_post_save() does the job of unregistering gid entries from the
> > backend device in the source host.
> >
> > pvrdma_post_load() maps to dsr using the loaded dma address, registers
> > each loaded gid into the backend device, and finally calls load_dsr()
> > to perform other mappings and ring init operations.
>
> I think it worth to mention that the dma address is kept in driver/device
> shared memory (dsr->dma) which is migrated as part of memory migration and
> it is out of the scope of this change and so we do not need to save/load
> the dma address during migration.
>
> Also you should specifically comment that this migration-support does not
> includes QP migration. This means that support for life migration *during*
> traffic is not yet supported.
>
> >
> > Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> > Cc: Yuval Shaia <yuval.shaia@oracle.com>
> > Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
> > ---
> >  hw/rdma/vmw/pvrdma_main.c | 77 +++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 77 insertions(+)
> >
> > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> > index 6c90db96f9..6f8b56dea3 100644
> > --- a/hw/rdma/vmw/pvrdma_main.c
> > +++ b/hw/rdma/vmw/pvrdma_main.c
> > @@ -28,6 +28,7 @@
> >  #include "sysemu/sysemu.h"
> >  #include "monitor/monitor.h"
> >  #include "hw/rdma/rdma.h"
> > +#include "migration/register.h"
> >
> >  #include "../rdma_rm.h"
> >  #include "../rdma_backend.h"
> > @@ -593,6 +594,81 @@ static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
> >      pvrdma_fini(pci_dev);
> >  }
> >
> > +static int pvrdma_post_save(void *opaque)
> > +{
> > +    int i, rc;
> > +    PVRDMADev *dev = opaque;
> > +
> > +    for (i = 0; i < MAX_GIDS; i++) {
> > +
>
> Empty line is redundant here.
>
> > +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> > +            continue;
> > +        }
> > +        rc = rdma_backend_del_gid(&dev->backend_dev,
> > +                                   dev->backend_eth_device_name,
> > +                                   &dev->rdma_dev_res.port.gid_tbl[i].gid);
> > +        if (rc) {
> > +            return -EINVAL;
>
> Some error report will help here i guess.

rdma_backend_del_gid() already generates an error report
when rc isn't 0.

Adding another statement for the same seems redundant.

> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int pvrdma_post_load(void *opaque, int version_id)
> > +{
> > +    int i, rc;
> > +    PVRDMADev *dev = opaque;
> > +    PCIDevice *pci_dev = PCI_DEVICE(dev);
> > +    DSRInfo *dsr_info = &dev->dsr_info;
> > +
> > +    dsr_info->dsr = rdma_pci_dma_map(pci_dev, dsr_info->dma,
> > +                                sizeof(struct pvrdma_device_shared_region));
> > +    if (!dsr_info->dsr) {
> > +        rdma_error_report("Failed to map to DSR");
> > +        return -ENOMEM;
> > +    }
> > +
> > +    for (i = 0; i < MAX_GIDS; i++) {
> > +
>
> Empty line is redundant here.
>
> > +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> > +            continue;
> > +        }
> > +
> > +        rc = rdma_backend_add_gid(&dev->backend_dev,
> > +                                  dev->backend_eth_device_name,
> > +                                  &dev->rdma_dev_res.port.gid_tbl[i].gid);
> > +        if (rc) {
> > +            return -EINVAL;
> > +        }
> > +    }
> > +
> > +    return load_dsr(dev);

Now that I will move load_dsr() before the add_gid loop,
I can use goto jumps on exit/error paths, so that I can
undo load_dsr if any add_gid fails.

> > +}
> > +
> > +static const VMStateDescription vmstate_pvrdma_gids = {
> > +    .name = "pvrdma-gids",
> > +    .fields = (VMStateField[]) {
> > +            VMSTATE_UINT8_ARRAY_V(gid.raw, RdmaRmGid, 16, 0),
> > +            VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static const VMStateDescription vmstate_pvrdma = {
> > +    .name = PVRDMA_HW_NAME,
> > +    .post_save = pvrdma_post_save,
> > +    .post_load = pvrdma_post_load,
> > +    .fields = (VMStateField[]) {
> > +            VMSTATE_PCI_DEVICE(parent_obj, PVRDMADev),
> > +            VMSTATE_MSIX(parent_obj, PVRDMADev),
> > +            VMSTATE_UINT64(dsr_info.dma, PVRDMADev),
> > +            VMSTATE_STRUCT_ARRAY(rdma_dev_res.port.gid_tbl, PVRDMADev,
> > +                                 MAX_PORT_GIDS, 0, vmstate_pvrdma_gids,
> > +                                 RdmaRmGid),
> > +            VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> >  static void pvrdma_realize(PCIDevice *pdev, Error **errp)
> >  {
> >      int rc = 0;
> > @@ -688,6 +764,7 @@ static void pvrdma_class_init(ObjectClass *klass, void *data)
> >
> >      dc->desc = "RDMA Device";
> >      dc->props = pvrdma_dev_properties;
> > +    dc->vmsd = &vmstate_pvrdma;
> >      set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
> >
> >      ir->print_statistics = pvrdma_print_statistics;
> > --
> > 2.21.0
> >
> >
Yuval Shaia Sept. 4, 2019, 5:04 a.m. UTC | #7
On Wed, Sep 04, 2019 at 03:03:20AM +0530, Sukrit Bhatnagar wrote:
> On Thu, 29 Aug 2019 at 18:23, Yuval Shaia <yuval.shaia@oracle.com> wrote:
> >
> > On Wed, Aug 28, 2019 at 07:53:28PM +0530, Sukrit Bhatnagar wrote:
> > > vmstate_pvrdma describes the PCI and MSIX states as well as the dma
> > > address for dsr and the gid table of device.
> > > vmstate_pvrdma_gids describes each gid in the gid table.
> > >
> > > pvrdma_post_save() does the job of unregistering gid entries from the
> > > backend device in the source host.
> > >
> > > pvrdma_post_load() maps to dsr using the loaded dma address, registers
> > > each loaded gid into the backend device, and finally calls load_dsr()
> > > to perform other mappings and ring init operations.
> >
> > I think it worth to mention that the dma address is kept in driver/device
> > shared memory (dsr->dma) which is migrated as part of memory migration and
> > it is out of the scope of this change and so we do not need to save/load
> > the dma address during migration.
> >
> > Also you should specifically comment that this migration-support does not
> > includes QP migration. This means that support for life migration *during*
> > traffic is not yet supported.
> >
> > >
> > > Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> > > Cc: Yuval Shaia <yuval.shaia@oracle.com>
> > > Signed-off-by: Sukrit Bhatnagar <skrtbhtngr@gmail.com>
> > > ---
> > >  hw/rdma/vmw/pvrdma_main.c | 77 +++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 77 insertions(+)
> > >
> > > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> > > index 6c90db96f9..6f8b56dea3 100644
> > > --- a/hw/rdma/vmw/pvrdma_main.c
> > > +++ b/hw/rdma/vmw/pvrdma_main.c
> > > @@ -28,6 +28,7 @@
> > >  #include "sysemu/sysemu.h"
> > >  #include "monitor/monitor.h"
> > >  #include "hw/rdma/rdma.h"
> > > +#include "migration/register.h"
> > >
> > >  #include "../rdma_rm.h"
> > >  #include "../rdma_backend.h"
> > > @@ -593,6 +594,81 @@ static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
> > >      pvrdma_fini(pci_dev);
> > >  }
> > >
> > > +static int pvrdma_post_save(void *opaque)
> > > +{
> > > +    int i, rc;
> > > +    PVRDMADev *dev = opaque;
> > > +
> > > +    for (i = 0; i < MAX_GIDS; i++) {
> > > +
> >
> > Empty line is redundant here.
> >
> > > +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> > > +            continue;
> > > +        }
> > > +        rc = rdma_backend_del_gid(&dev->backend_dev,
> > > +                                   dev->backend_eth_device_name,
> > > +                                   &dev->rdma_dev_res.port.gid_tbl[i].gid);
> > > +        if (rc) {
> > > +            return -EINVAL;
> >
> > An error report would help here, I guess.
> 
> rdma_backend_del_gid() already generates an error report
> when rc isn't 0.
> 
> Adding another statement for the same seems redundant.

Sure, makes sense.

> 
> > > +        }
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int pvrdma_post_load(void *opaque, int version_id)
> > > +{
> > > +    int i, rc;
> > > +    PVRDMADev *dev = opaque;
> > > +    PCIDevice *pci_dev = PCI_DEVICE(dev);
> > > +    DSRInfo *dsr_info = &dev->dsr_info;
> > > +
> > > +    dsr_info->dsr = rdma_pci_dma_map(pci_dev, dsr_info->dma,
> > > +                                sizeof(struct pvrdma_device_shared_region));
> > > +    if (!dsr_info->dsr) {
> > > +        rdma_error_report("Failed to map to DSR");
> > > +        return -ENOMEM;
> > > +    }
> > > +
> > > +    for (i = 0; i < MAX_GIDS; i++) {
> > > +
> >
> > Empty line is redundant here.
> >
> > > +        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
> > > +            continue;
> > > +        }
> > > +
> > > +        rc = rdma_backend_add_gid(&dev->backend_dev,
> > > +                                  dev->backend_eth_device_name,
> > > +                                  &dev->rdma_dev_res.port.gid_tbl[i].gid);
> > > +        if (rc) {
> > > +            return -EINVAL;
> > > +        }
> > > +    }
> > > +
> > > +    return load_dsr(dev);
> 
> Now that I will move load_dsr() before the del_gid loop,

You probably meant before the add_gid loop.

> I can use goto jumps on exit/error paths, so that I can
> undo load_dsr if any del_gid fails.

Yeah, it will be easier to undo load_dsr than add_gid.

> 
> > > +}
> > > +
> > > +static const VMStateDescription vmstate_pvrdma_gids = {
> > > +    .name = "pvrdma-gids",
> > > +    .fields = (VMStateField[]) {
> > > +            VMSTATE_UINT8_ARRAY_V(gid.raw, RdmaRmGid, 16, 0),
> > > +            VMSTATE_END_OF_LIST()
> > > +    }
> > > +};
> > > +
> > > +static const VMStateDescription vmstate_pvrdma = {
> > > +    .name = PVRDMA_HW_NAME,
> > > +    .post_save = pvrdma_post_save,
> > > +    .post_load = pvrdma_post_load,
> > > +    .fields = (VMStateField[]) {
> > > +            VMSTATE_PCI_DEVICE(parent_obj, PVRDMADev),
> > > +            VMSTATE_MSIX(parent_obj, PVRDMADev),
> > > +            VMSTATE_UINT64(dsr_info.dma, PVRDMADev),
> > > +            VMSTATE_STRUCT_ARRAY(rdma_dev_res.port.gid_tbl, PVRDMADev,
> > > +                                 MAX_PORT_GIDS, 0, vmstate_pvrdma_gids,
> > > +                                 RdmaRmGid),
> > > +            VMSTATE_END_OF_LIST()
> > > +    }
> > > +};
> > > +
> > >  static void pvrdma_realize(PCIDevice *pdev, Error **errp)
> > >  {
> > >      int rc = 0;
> > > @@ -688,6 +764,7 @@ static void pvrdma_class_init(ObjectClass *klass, void *data)
> > >
> > >      dc->desc = "RDMA Device";
> > >      dc->props = pvrdma_dev_properties;
> > > +    dc->vmsd = &vmstate_pvrdma;
> > >      set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
> > >
> > >      ir->print_statistics = pvrdma_print_statistics;
> > > --
> > > 2.21.0
> > >
> > >
diff mbox series

Patch

diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index 6c90db96f9..6f8b56dea3 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -28,6 +28,7 @@ 
 #include "sysemu/sysemu.h"
 #include "monitor/monitor.h"
 #include "hw/rdma/rdma.h"
+#include "migration/register.h"
 
 #include "../rdma_rm.h"
 #include "../rdma_backend.h"
@@ -593,6 +594,81 @@  static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
     pvrdma_fini(pci_dev);
 }
 
+static int pvrdma_post_save(void *opaque)
+{
+    int i, rc;
+    PVRDMADev *dev = opaque;
+
+    for (i = 0; i < MAX_GIDS; i++) {
+
+        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
+            continue;
+        }
+        rc = rdma_backend_del_gid(&dev->backend_dev,
+                                   dev->backend_eth_device_name,
+                                   &dev->rdma_dev_res.port.gid_tbl[i].gid);
+        if (rc) {
+            return -EINVAL;
+        }
+    }
+
+    return 0;
+}
+
+static int pvrdma_post_load(void *opaque, int version_id)
+{
+    int i, rc;
+    PVRDMADev *dev = opaque;
+    PCIDevice *pci_dev = PCI_DEVICE(dev);
+    DSRInfo *dsr_info = &dev->dsr_info;
+
+    dsr_info->dsr = rdma_pci_dma_map(pci_dev, dsr_info->dma,
+                                sizeof(struct pvrdma_device_shared_region));
+    if (!dsr_info->dsr) {
+        rdma_error_report("Failed to map to DSR");
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < MAX_GIDS; i++) {
+
+        if (!dev->rdma_dev_res.port.gid_tbl[i].gid.global.interface_id) {
+            continue;
+        }
+
+        rc = rdma_backend_add_gid(&dev->backend_dev,
+                                  dev->backend_eth_device_name,
+                                  &dev->rdma_dev_res.port.gid_tbl[i].gid);
+        if (rc) {
+            return -EINVAL;
+        }
+    }
+
+    return load_dsr(dev);
+}
+
+static const VMStateDescription vmstate_pvrdma_gids = {
+    .name = "pvrdma-gids",
+    .fields = (VMStateField[]) {
+            VMSTATE_UINT8_ARRAY_V(gid.raw, RdmaRmGid, 16, 0),
+            VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription vmstate_pvrdma = {
+    .name = PVRDMA_HW_NAME,
+    .post_save = pvrdma_post_save,
+    .post_load = pvrdma_post_load,
+    .fields = (VMStateField[]) {
+            VMSTATE_PCI_DEVICE(parent_obj, PVRDMADev),
+            VMSTATE_MSIX(parent_obj, PVRDMADev),
+            VMSTATE_UINT64(dsr_info.dma, PVRDMADev),
+            VMSTATE_STRUCT_ARRAY(rdma_dev_res.port.gid_tbl, PVRDMADev,
+                                 MAX_PORT_GIDS, 0, vmstate_pvrdma_gids,
+                                 RdmaRmGid),
+            VMSTATE_END_OF_LIST()
+    }
+};
+
 static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 {
     int rc = 0;
@@ -688,6 +764,7 @@  static void pvrdma_class_init(ObjectClass *klass, void *data)
 
     dc->desc = "RDMA Device";
     dc->props = pvrdma_dev_properties;
+    dc->vmsd = &vmstate_pvrdma;
     set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
 
     ir->print_statistics = pvrdma_print_statistics;