diff mbox series

[v8,12/17] vfio-user: IOMMU support for remote device

Message ID 2a42664b61cef7cdd44688679b60a8c6c397b075.1650379269.git.jag.raman@oracle.com
State New
Headers show
Series vfio-user server in QEMU | expand

Commit Message

Jag Raman April 19, 2022, 8:44 p.m. UTC
Assign separate address space for each device in the remote processes.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 include/hw/remote/iommu.h |  40 +++++++++++++
 hw/remote/iommu.c         | 114 ++++++++++++++++++++++++++++++++++++++
 hw/remote/machine.c       |  13 ++++-
 MAINTAINERS               |   2 +
 hw/remote/meson.build     |   1 +
 5 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 include/hw/remote/iommu.h
 create mode 100644 hw/remote/iommu.c

Comments

Jag Raman April 20, 2022, 11:15 a.m. UTC | #1
> On Apr 19, 2022, at 4:45 PM, Jag Raman <jag.raman@oracle.com> wrote:
> 
> Assign separate address space for each device in the remote processes.
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
> include/hw/remote/iommu.h |  40 +++++++++++++
> hw/remote/iommu.c         | 114 ++++++++++++++++++++++++++++++++++++++
> hw/remote/machine.c       |  13 ++++-
> MAINTAINERS               |   2 +
> hw/remote/meson.build     |   1 +
> 5 files changed, 169 insertions(+), 1 deletion(-)
> create mode 100644 include/hw/remote/iommu.h
> create mode 100644 hw/remote/iommu.c
> 
> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
> new file mode 100644
> index 0000000000..33b68a8f4b
> --- /dev/null
> +++ b/include/hw/remote/iommu.h
> @@ -0,0 +1,40 @@
> +/**
> + * Copyright © 2022 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef REMOTE_IOMMU_H
> +#define REMOTE_IOMMU_H
> +
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci.h"
> +
> +#ifndef INT2VOIDP
> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
> +#endif
> +
> +typedef struct RemoteIommuElem {
> +    MemoryRegion *mr;
> +
> +    AddressSpace as;
> +} RemoteIommuElem;
> +
> +#define TYPE_REMOTE_IOMMU "x-remote-iommu"
> +OBJECT_DECLARE_SIMPLE_TYPE(RemoteIommu, REMOTE_IOMMU)
> +
> +struct RemoteIommu {
> +    Object parent;
> +
> +    GHashTable *elem_by_devfn;
> +
> +    QemuMutex lock;
> +};
> +
> +void remote_iommu_setup(PCIBus *pci_bus);
> +
> +void remote_iommu_unplug_dev(PCIDevice *pci_dev);
> +
> +#endif
> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
> new file mode 100644
> index 0000000000..16c6b0834e
> --- /dev/null
> +++ b/hw/remote/iommu.c
> @@ -0,0 +1,114 @@
> +/**
> + * IOMMU for remote device
> + *
> + * Copyright © 2022 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +
> +#include "hw/remote/iommu.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci.h"
> +#include "exec/memory.h"
> +#include "exec/address-spaces.h"
> +#include "trace.h"
> +
> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
> +                                              void *opaque, int devfn)
> +{
> +    RemoteIommu *iommu = opaque;
> +    RemoteIommuElem *elem = NULL;
> +
> +    qemu_mutex_lock(&iommu->lock);
> +
> +    elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn));
> +
> +    if (!elem) {
> +        elem = g_malloc0(sizeof(RemoteIommuElem));
> +        g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem);
> +    }
> +
> +    if (!elem->mr) {
> +        elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
> +        memory_region_set_size(elem->mr, UINT64_MAX);
> +        address_space_init(&elem->as, elem->mr, NULL);

Hi,

I’d like to add a note here.

We tried to add "elem->mr” as a child of PCIDevice. That way, when PCIDevice is
unplugged, the child is also finalized.

However, there was some issue with hotplug. During the hotplug, there’s a window
during initialization where we couldn’t lookup the PCIDevice by “devfn”.

do_pci_register_device() -> pci_init_bus_master() -> pci_device_iommu_address_space()
happens before do_pci_register_device() -> “bus->devices[devfn] = pci_dev”. As such,
pci_find_device() doesn’t work at this time.

Thank you!
--
Jag

> +    }
> +
> +    qemu_mutex_unlock(&iommu->lock);
> +
> +    return &elem->as;
> +}
> +
> +void remote_iommu_unplug_dev(PCIDevice *pci_dev)
> +{
> +    AddressSpace *as = pci_device_iommu_address_space(pci_dev);
> +    RemoteIommuElem *elem = NULL;
> +
> +    if (as == &address_space_memory) {
> +        return;
> +    }
> +
> +    elem = container_of(as, RemoteIommuElem, as);
> +
> +    address_space_destroy(&elem->as);
> +
> +    object_unref(elem->mr);
> +
> +    elem->mr = NULL;
> +}
> +
> +static void remote_iommu_init(Object *obj)
> +{
> +    RemoteIommu *iommu = REMOTE_IOMMU(obj);
> +
> +    iommu->elem_by_devfn = g_hash_table_new_full(NULL, NULL, NULL, g_free);
> +
> +    qemu_mutex_init(&iommu->lock);
> +}
> +
> +static void remote_iommu_finalize(Object *obj)
> +{
> +    RemoteIommu *iommu = REMOTE_IOMMU(obj);
> +
> +    qemu_mutex_destroy(&iommu->lock);
> +
> +    if (iommu->elem_by_devfn) {
> +        g_hash_table_destroy(iommu->elem_by_devfn);
> +        iommu->elem_by_devfn = NULL;
> +    }
> +}
> +
> +void remote_iommu_setup(PCIBus *pci_bus)
> +{
> +    RemoteIommu *iommu = NULL;
> +
> +    g_assert(pci_bus);
> +
> +    iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU));
> +
> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu);
> +
> +    object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu));
> +
> +    object_unref(OBJECT(iommu));
> +}
> +
> +static const TypeInfo remote_iommu_info = {
> +    .name = TYPE_REMOTE_IOMMU,
> +    .parent = TYPE_OBJECT,
> +    .instance_size = sizeof(RemoteIommu),
> +    .instance_init = remote_iommu_init,
> +    .instance_finalize = remote_iommu_finalize,
> +};
> +
> +static void remote_iommu_register_types(void)
> +{
> +    type_register_static(&remote_iommu_info);
> +}
> +
> +type_init(remote_iommu_register_types)
> diff --git a/hw/remote/machine.c b/hw/remote/machine.c
> index ed91659794..cca5d25f50 100644
> --- a/hw/remote/machine.c
> +++ b/hw/remote/machine.c
> @@ -21,6 +21,7 @@
> #include "qapi/error.h"
> #include "hw/pci/pci_host.h"
> #include "hw/remote/iohub.h"
> +#include "hw/remote/iommu.h"
> #include "hw/qdev-core.h"
> 
> static void remote_machine_init(MachineState *machine)
> @@ -100,6 +101,16 @@ static void remote_machine_instance_init(Object *obj)
>     s->auto_shutdown = true;
> }
> 
> +static void remote_machine_dev_unplug_cb(HotplugHandler *hotplug_dev,
> +                                         DeviceState *dev, Error **errp)
> +{
> +    qdev_unrealize(dev);
> +
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
> +        remote_iommu_unplug_dev(PCI_DEVICE(dev));
> +    }
> +}
> +
> static void remote_machine_class_init(ObjectClass *oc, void *data)
> {
>     MachineClass *mc = MACHINE_CLASS(oc);
> @@ -108,7 +119,7 @@ static void remote_machine_class_init(ObjectClass *oc, void *data)
>     mc->init = remote_machine_init;
>     mc->desc = "Experimental remote machine";
> 
> -    hc->unplug = qdev_simple_device_unplug_cb;
> +    hc->unplug = remote_machine_dev_unplug_cb;
> 
>     object_class_property_add_bool(oc, "vfio-user",
>                                    remote_machine_get_vfio_user,
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 37afdecc61..3e62ccab0a 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3599,6 +3599,8 @@ F: hw/remote/iohub.c
> F: include/hw/remote/iohub.h
> F: subprojects/libvfio-user
> F: hw/remote/vfio-user-obj.c
> +F: hw/remote/iommu.c
> +F: include/hw/remote/iommu.h
> 
> EBPF:
> M: Jason Wang <jasowang@redhat.com>
> diff --git a/hw/remote/meson.build b/hw/remote/meson.build
> index 534ac5df79..bcef83c8cc 100644
> --- a/hw/remote/meson.build
> +++ b/hw/remote/meson.build
> @@ -6,6 +6,7 @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
> +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c'))
> remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c'))
> 
> remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: vfiouser)
> -- 
> 2.20.1
>
Stefan Hajnoczi April 25, 2022, 9:31 a.m. UTC | #2
On Tue, Apr 19, 2022 at 04:44:17PM -0400, Jagannathan Raman wrote:
> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
> +                                              void *opaque, int devfn)
> +{
> +    RemoteIommu *iommu = opaque;
> +    RemoteIommuElem *elem = NULL;
> +
> +    qemu_mutex_lock(&iommu->lock);
> +
> +    elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn));
> +
> +    if (!elem) {
> +        elem = g_malloc0(sizeof(RemoteIommuElem));
> +        g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem);
> +    }
> +
> +    if (!elem->mr) {
> +        elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
> +        memory_region_set_size(elem->mr, UINT64_MAX);
> +        address_space_init(&elem->as, elem->mr, NULL);
> +    }
> +
> +    qemu_mutex_unlock(&iommu->lock);
> +
> +    return &elem->as;
> +}

A few comments that can be added to this file to explain the design:

- Each vfio-user server handles one PCIDevice on a PCIBus. There is one
  RemoteIommu per PCIBus, so the RemoteIommu tracks multiple PCIDevices
  by maintaining a ->elem_by_devfn mapping.

- memory_region_init_iommu() is not used because vfio-user MemoryRegions
  will be added to the elem->mr container instead. This is more natural
  than implementing the IOMMUMemoryRegionClass APIs since vfio-user
  provides something that is close to a full-fledged MemoryRegion and
  not like an IOMMU mapping.

- When a device is hot unplugged, the elem->mr reference is dropped so
  all vfio-user MemoryRegions associated with this vfio-user server are
  destroyed.

> +static void remote_iommu_finalize(Object *obj)
> +{
> +    RemoteIommu *iommu = REMOTE_IOMMU(obj);
> +
> +    qemu_mutex_destroy(&iommu->lock);
> +
> +    if (iommu->elem_by_devfn) {

->init() and ->finalize() are a pair, so I don't think ->finalize() will
ever be called with a NULL ->elem_by_devfn.

If ->elem_by_devfn can be NULL then there would probably need to be a
check around qemu_mutex_destroy(&iommu->lock) too.

> +        g_hash_table_destroy(iommu->elem_by_devfn);
> +        iommu->elem_by_devfn = NULL;
> +    }
> +}
Stefan Hajnoczi April 25, 2022, 9:38 a.m. UTC | #3
On Wed, Apr 20, 2022 at 11:15:16AM +0000, Jag Raman wrote:
> 
> 
> > On Apr 19, 2022, at 4:45 PM, Jag Raman <jag.raman@oracle.com> wrote:
> > 
> > Assign separate address space for each device in the remote processes.
> > 
> > Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> > Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> > Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> > ---
> > include/hw/remote/iommu.h |  40 +++++++++++++
> > hw/remote/iommu.c         | 114 ++++++++++++++++++++++++++++++++++++++
> > hw/remote/machine.c       |  13 ++++-
> > MAINTAINERS               |   2 +
> > hw/remote/meson.build     |   1 +
> > 5 files changed, 169 insertions(+), 1 deletion(-)
> > create mode 100644 include/hw/remote/iommu.h
> > create mode 100644 hw/remote/iommu.c
> > 
> > diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
> > new file mode 100644
> > index 0000000000..33b68a8f4b
> > --- /dev/null
> > +++ b/include/hw/remote/iommu.h
> > @@ -0,0 +1,40 @@
> > +/**
> > + * Copyright © 2022 Oracle and/or its affiliates.
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + *
> > + */
> > +
> > +#ifndef REMOTE_IOMMU_H
> > +#define REMOTE_IOMMU_H
> > +
> > +#include "hw/pci/pci_bus.h"
> > +#include "hw/pci/pci.h"
> > +
> > +#ifndef INT2VOIDP
> > +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
> > +#endif
> > +
> > +typedef struct RemoteIommuElem {
> > +    MemoryRegion *mr;
> > +
> > +    AddressSpace as;
> > +} RemoteIommuElem;
> > +
> > +#define TYPE_REMOTE_IOMMU "x-remote-iommu"
> > +OBJECT_DECLARE_SIMPLE_TYPE(RemoteIommu, REMOTE_IOMMU)
> > +
> > +struct RemoteIommu {
> > +    Object parent;
> > +
> > +    GHashTable *elem_by_devfn;
> > +
> > +    QemuMutex lock;
> > +};
> > +
> > +void remote_iommu_setup(PCIBus *pci_bus);
> > +
> > +void remote_iommu_unplug_dev(PCIDevice *pci_dev);
> > +
> > +#endif
> > diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
> > new file mode 100644
> > index 0000000000..16c6b0834e
> > --- /dev/null
> > +++ b/hw/remote/iommu.c
> > @@ -0,0 +1,114 @@
> > +/**
> > + * IOMMU for remote device
> > + *
> > + * Copyright © 2022 Oracle and/or its affiliates.
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + *
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qemu-common.h"
> > +
> > +#include "hw/remote/iommu.h"
> > +#include "hw/pci/pci_bus.h"
> > +#include "hw/pci/pci.h"
> > +#include "exec/memory.h"
> > +#include "exec/address-spaces.h"
> > +#include "trace.h"
> > +
> > +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
> > +                                              void *opaque, int devfn)
> > +{
> > +    RemoteIommu *iommu = opaque;
> > +    RemoteIommuElem *elem = NULL;
> > +
> > +    qemu_mutex_lock(&iommu->lock);
> > +
> > +    elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn));
> > +
> > +    if (!elem) {
> > +        elem = g_malloc0(sizeof(RemoteIommuElem));
> > +        g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem);
> > +    }
> > +
> > +    if (!elem->mr) {
> > +        elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
> > +        memory_region_set_size(elem->mr, UINT64_MAX);
> > +        address_space_init(&elem->as, elem->mr, NULL);
> 
> Hi,
> 
> I’d like to add a note here.
> 
> We tried to add "elem->mr” as a child of PCIDevice. That way, when PCIDevice is
> unplugged, the child is also finalized.

Do you mean via a memory_region_init()-family function where a parent
object is given? Or do you mean by adding a QOM child property?

> However, there was some issue with hotplug. During the hotplug, there’s a window
> during initialization where we couldn’t lookup the PCIDevice by “devfn”.
> 
> do_pci_register_device() -> pci_init_bus_master() -> pci_device_iommu_address_space()
> happens before do_pci_register_device() -> “bus->devices[devfn] = pci_dev”. As such,
> pci_find_device() doesn’t work at this time.

I don't follow. What calls pci_find_device()?

Stefan
Jag Raman April 25, 2022, 5:26 p.m. UTC | #4
> On Apr 25, 2022, at 5:31 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Tue, Apr 19, 2022 at 04:44:17PM -0400, Jagannathan Raman wrote:
>> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
>> +                                              void *opaque, int devfn)
>> +{
>> +    RemoteIommu *iommu = opaque;
>> +    RemoteIommuElem *elem = NULL;
>> +
>> +    qemu_mutex_lock(&iommu->lock);
>> +
>> +    elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn));
>> +
>> +    if (!elem) {
>> +        elem = g_malloc0(sizeof(RemoteIommuElem));
>> +        g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem);
>> +    }
>> +
>> +    if (!elem->mr) {
>> +        elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
>> +        memory_region_set_size(elem->mr, UINT64_MAX);
>> +        address_space_init(&elem->as, elem->mr, NULL);
>> +    }
>> +
>> +    qemu_mutex_unlock(&iommu->lock);
>> +
>> +    return &elem->as;
>> +}
> 
> A few comments that can be added to this file to explain the design:
> 
> - Each vfio-user server handles one PCIDevice on a PCIBus. There is one
>  RemoteIommu per PCIBus, so the RemoteIommu tracks multiple PCIDevices
>  by maintaining a ->elem_by_devfn mapping.
> 
> - memory_region_init_iommu() is not used because vfio-user MemoryRegions
>  will be added to the elem->mr container instead. This is more natural
>  than implementing the IOMMUMemoryRegionClass APIs since vfio-user
>  provides something that is close to a full-fledged MemoryRegion and
>  not like an IOMMU mapping.
> 
> - When a device is hot unplugged, the elem->mr reference is dropped so
>  all vfio-user MemoryRegions associated with this vfio-user server are
>  destroyed.

OK, will add this comment.

> 
>> +static void remote_iommu_finalize(Object *obj)
>> +{
>> +    RemoteIommu *iommu = REMOTE_IOMMU(obj);
>> +
>> +    qemu_mutex_destroy(&iommu->lock);
>> +
>> +    if (iommu->elem_by_devfn) {
> 
> ->init() and ->finalize() are a pair, so I don't think ->finalize() will
> ever be called with a NULL ->elem_by_devfn.

OK, got it.

Thanks!
--
Jag

> 
> If ->elem_by_devfn can be NULL then there would probably need to be a
> check around qemu_mutex_destroy(&iommu->lock) too.
> 
>> +        g_hash_table_destroy(iommu->elem_by_devfn);
>> +        iommu->elem_by_devfn = NULL;
>> +    }
>> +}
Jag Raman April 25, 2022, 5:30 p.m. UTC | #5
> On Apr 25, 2022, at 5:38 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Wed, Apr 20, 2022 at 11:15:16AM +0000, Jag Raman wrote:
>> 
>> 
>>> On Apr 19, 2022, at 4:45 PM, Jag Raman <jag.raman@oracle.com> wrote:
>>> 
>>> Assign separate address space for each device in the remote processes.
>>> 
>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>>> ---
>>> include/hw/remote/iommu.h |  40 +++++++++++++
>>> hw/remote/iommu.c         | 114 ++++++++++++++++++++++++++++++++++++++
>>> hw/remote/machine.c       |  13 ++++-
>>> MAINTAINERS               |   2 +
>>> hw/remote/meson.build     |   1 +
>>> 5 files changed, 169 insertions(+), 1 deletion(-)
>>> create mode 100644 include/hw/remote/iommu.h
>>> create mode 100644 hw/remote/iommu.c
>>> 
>>> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
>>> new file mode 100644
>>> index 0000000000..33b68a8f4b
>>> --- /dev/null
>>> +++ b/include/hw/remote/iommu.h
>>> @@ -0,0 +1,40 @@
>>> +/**
>>> + * Copyright © 2022 Oracle and/or its affiliates.
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>> + * See the COPYING file in the top-level directory.
>>> + *
>>> + */
>>> +
>>> +#ifndef REMOTE_IOMMU_H
>>> +#define REMOTE_IOMMU_H
>>> +
>>> +#include "hw/pci/pci_bus.h"
>>> +#include "hw/pci/pci.h"
>>> +
>>> +#ifndef INT2VOIDP
>>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
>>> +#endif
>>> +
>>> +typedef struct RemoteIommuElem {
>>> +    MemoryRegion *mr;
>>> +
>>> +    AddressSpace as;
>>> +} RemoteIommuElem;
>>> +
>>> +#define TYPE_REMOTE_IOMMU "x-remote-iommu"
>>> +OBJECT_DECLARE_SIMPLE_TYPE(RemoteIommu, REMOTE_IOMMU)
>>> +
>>> +struct RemoteIommu {
>>> +    Object parent;
>>> +
>>> +    GHashTable *elem_by_devfn;
>>> +
>>> +    QemuMutex lock;
>>> +};
>>> +
>>> +void remote_iommu_setup(PCIBus *pci_bus);
>>> +
>>> +void remote_iommu_unplug_dev(PCIDevice *pci_dev);
>>> +
>>> +#endif
>>> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
>>> new file mode 100644
>>> index 0000000000..16c6b0834e
>>> --- /dev/null
>>> +++ b/hw/remote/iommu.c
>>> @@ -0,0 +1,114 @@
>>> +/**
>>> + * IOMMU for remote device
>>> + *
>>> + * Copyright © 2022 Oracle and/or its affiliates.
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>> + * See the COPYING file in the top-level directory.
>>> + *
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qemu-common.h"
>>> +
>>> +#include "hw/remote/iommu.h"
>>> +#include "hw/pci/pci_bus.h"
>>> +#include "hw/pci/pci.h"
>>> +#include "exec/memory.h"
>>> +#include "exec/address-spaces.h"
>>> +#include "trace.h"
>>> +
>>> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
>>> +                                              void *opaque, int devfn)
>>> +{
>>> +    RemoteIommu *iommu = opaque;
>>> +    RemoteIommuElem *elem = NULL;
>>> +
>>> +    qemu_mutex_lock(&iommu->lock);
>>> +
>>> +    elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn));
>>> +
>>> +    if (!elem) {
>>> +        elem = g_malloc0(sizeof(RemoteIommuElem));
>>> +        g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem);
>>> +    }
>>> +
>>> +    if (!elem->mr) {
>>> +        elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
>>> +        memory_region_set_size(elem->mr, UINT64_MAX);
>>> +        address_space_init(&elem->as, elem->mr, NULL);
>> 
>> Hi,
>> 
>> I’d like to add a note here.
>> 
>> We tried to add "elem->mr” as a child of PCIDevice. That way, when PCIDevice is
>> unplugged, the child is also finalized.
> 
> Do you mean via a memory_region_init()-family function where a parent
> object is given? Or do you mean by adding a QOM child property?

I mean by adding “elem->mr” as a QOM child property of PCIDevice.

> 
>> However, there was some issue with hotplug. During the hotplug, there’s a window
>> during initialization where we couldn’t lookup the PCIDevice by “devfn”.
>> 
>> do_pci_register_device() -> pci_init_bus_master() -> pci_device_iommu_address_space()
>> happens before do_pci_register_device() -> “bus->devices[devfn] = pci_dev”. As such,
>> pci_find_device() doesn’t work at this time.
> 
> I don't follow. What calls pci_find_device()?

To add the MemoryRegion as a child of PCIDevice, remote_iommu_find_add_as()
would need to lookup the PCIDevice using devfn. The function that looks up
PCIDevice by devfn is pci_find_device().

The above note explains why we didn’t lookup the PCIDevice using pci_find_device()
and then adding the MemoryRegion as its child.

Thank you!
--
Jag

> 
> Stefan
Stefan Hajnoczi April 28, 2022, 9:47 a.m. UTC | #6
On Mon, Apr 25, 2022 at 05:30:19PM +0000, Jag Raman wrote:
> 
> 
> > On Apr 25, 2022, at 5:38 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Wed, Apr 20, 2022 at 11:15:16AM +0000, Jag Raman wrote:
> >> 
> >> 
> >>> On Apr 19, 2022, at 4:45 PM, Jag Raman <jag.raman@oracle.com> wrote:
> >>> 
> >>> Assign separate address space for each device in the remote processes.
> >>> 
> >>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >>> ---
> >>> include/hw/remote/iommu.h |  40 +++++++++++++
> >>> hw/remote/iommu.c         | 114 ++++++++++++++++++++++++++++++++++++++
> >>> hw/remote/machine.c       |  13 ++++-
> >>> MAINTAINERS               |   2 +
> >>> hw/remote/meson.build     |   1 +
> >>> 5 files changed, 169 insertions(+), 1 deletion(-)
> >>> create mode 100644 include/hw/remote/iommu.h
> >>> create mode 100644 hw/remote/iommu.c
> >>> 
> >>> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
> >>> new file mode 100644
> >>> index 0000000000..33b68a8f4b
> >>> --- /dev/null
> >>> +++ b/include/hw/remote/iommu.h
> >>> @@ -0,0 +1,40 @@
> >>> +/**
> >>> + * Copyright © 2022 Oracle and/or its affiliates.
> >>> + *
> >>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >>> + * See the COPYING file in the top-level directory.
> >>> + *
> >>> + */
> >>> +
> >>> +#ifndef REMOTE_IOMMU_H
> >>> +#define REMOTE_IOMMU_H
> >>> +
> >>> +#include "hw/pci/pci_bus.h"
> >>> +#include "hw/pci/pci.h"
> >>> +
> >>> +#ifndef INT2VOIDP
> >>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
> >>> +#endif
> >>> +
> >>> +typedef struct RemoteIommuElem {
> >>> +    MemoryRegion *mr;
> >>> +
> >>> +    AddressSpace as;
> >>> +} RemoteIommuElem;
> >>> +
> >>> +#define TYPE_REMOTE_IOMMU "x-remote-iommu"
> >>> +OBJECT_DECLARE_SIMPLE_TYPE(RemoteIommu, REMOTE_IOMMU)
> >>> +
> >>> +struct RemoteIommu {
> >>> +    Object parent;
> >>> +
> >>> +    GHashTable *elem_by_devfn;
> >>> +
> >>> +    QemuMutex lock;
> >>> +};
> >>> +
> >>> +void remote_iommu_setup(PCIBus *pci_bus);
> >>> +
> >>> +void remote_iommu_unplug_dev(PCIDevice *pci_dev);
> >>> +
> >>> +#endif
> >>> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
> >>> new file mode 100644
> >>> index 0000000000..16c6b0834e
> >>> --- /dev/null
> >>> +++ b/hw/remote/iommu.c
> >>> @@ -0,0 +1,114 @@
> >>> +/**
> >>> + * IOMMU for remote device
> >>> + *
> >>> + * Copyright © 2022 Oracle and/or its affiliates.
> >>> + *
> >>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >>> + * See the COPYING file in the top-level directory.
> >>> + *
> >>> + */
> >>> +
> >>> +#include "qemu/osdep.h"
> >>> +#include "qemu-common.h"
> >>> +
> >>> +#include "hw/remote/iommu.h"
> >>> +#include "hw/pci/pci_bus.h"
> >>> +#include "hw/pci/pci.h"
> >>> +#include "exec/memory.h"
> >>> +#include "exec/address-spaces.h"
> >>> +#include "trace.h"
> >>> +
> >>> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
> >>> +                                              void *opaque, int devfn)
> >>> +{
> >>> +    RemoteIommu *iommu = opaque;
> >>> +    RemoteIommuElem *elem = NULL;
> >>> +
> >>> +    qemu_mutex_lock(&iommu->lock);
> >>> +
> >>> +    elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn));
> >>> +
> >>> +    if (!elem) {
> >>> +        elem = g_malloc0(sizeof(RemoteIommuElem));
> >>> +        g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem);
> >>> +    }
> >>> +
> >>> +    if (!elem->mr) {
> >>> +        elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
> >>> +        memory_region_set_size(elem->mr, UINT64_MAX);
> >>> +        address_space_init(&elem->as, elem->mr, NULL);
> >> 
> >> Hi,
> >> 
> >> I’d like to add a note here.
> >> 
> >> We tried to add "elem->mr” as a child of PCIDevice. That way, when PCIDevice is
> >> unplugged, the child is also finalized.
> > 
> > Do you mean via a memory_region_init()-family function where a parent
> > object is given? Or do you mean by adding a QOM child property?
> 
> I mean by adding “elem->mr” as a QOM child property of PCIDevice.
> 
> > 
> >> However, there was some issue with hotplug. During the hotplug, there’s a window
> >> during initialization where we couldn’t lookup the PCIDevice by “devfn”.
> >> 
> >> do_pci_register_device() -> pci_init_bus_master() -> pci_device_iommu_address_space()
> >> happens before do_pci_register_device() -> “bus->devices[devfn] = pci_dev”. As such,
> >> pci_find_device() doesn’t work at this time.
> > 
> > I don't follow. What calls pci_find_device()?
> 
> To add the MemoryRegion as a child of PCIDevice, remote_iommu_find_add_as()
> would need to lookup the PCIDevice using devfn. The function that looks up
> PCIDevice by devfn is pci_find_device().
> 
> The above note explains why we didn’t lookup the PCIDevice using pci_find_device()
> and then adding the MemoryRegion as its child.

If I understand correctly you're saying it's not possible to use
memory_region_init(&elem->mr, OBJECT(pci_dev), ...) inside
remote_iommu_find_add_as() because there is no way to find the
PCIDevice?

It would be nice to automatically clean up the memory region but the
AddressSpace needs to be destroyed too and there isn't an automatic way
of doing that, so the approach in this patch is fine.

Stefan
diff mbox series

Patch

diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
new file mode 100644
index 0000000000..33b68a8f4b
--- /dev/null
+++ b/include/hw/remote/iommu.h
@@ -0,0 +1,40 @@ 
+/**
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_IOMMU_H
+#define REMOTE_IOMMU_H
+
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
+
+#ifndef INT2VOIDP
+#define INT2VOIDP(i) (void *)(uintptr_t)(i)
+#endif
+
+typedef struct RemoteIommuElem {
+    MemoryRegion *mr;
+
+    AddressSpace as;
+} RemoteIommuElem;
+
+#define TYPE_REMOTE_IOMMU "x-remote-iommu"
+OBJECT_DECLARE_SIMPLE_TYPE(RemoteIommu, REMOTE_IOMMU)
+
+struct RemoteIommu {
+    Object parent;
+
+    GHashTable *elem_by_devfn;
+
+    QemuMutex lock;
+};
+
+void remote_iommu_setup(PCIBus *pci_bus);
+
+void remote_iommu_unplug_dev(PCIDevice *pci_dev);
+
+#endif
diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
new file mode 100644
index 0000000000..16c6b0834e
--- /dev/null
+++ b/hw/remote/iommu.c
@@ -0,0 +1,114 @@ 
+/**
+ * IOMMU for remote device
+ *
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
+
+static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
+                                              void *opaque, int devfn)
+{
+    RemoteIommu *iommu = opaque;
+    RemoteIommuElem *elem = NULL;
+
+    qemu_mutex_lock(&iommu->lock);
+
+    elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn));
+
+    if (!elem) {
+        elem = g_malloc0(sizeof(RemoteIommuElem));
+        g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem);
+    }
+
+    if (!elem->mr) {
+        elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
+        memory_region_set_size(elem->mr, UINT64_MAX);
+        address_space_init(&elem->as, elem->mr, NULL);
+    }
+
+    qemu_mutex_unlock(&iommu->lock);
+
+    return &elem->as;
+}
+
+void remote_iommu_unplug_dev(PCIDevice *pci_dev)
+{
+    AddressSpace *as = pci_device_iommu_address_space(pci_dev);
+    RemoteIommuElem *elem = NULL;
+
+    if (as == &address_space_memory) {
+        return;
+    }
+
+    elem = container_of(as, RemoteIommuElem, as);
+
+    address_space_destroy(&elem->as);
+
+    object_unref(elem->mr);
+
+    elem->mr = NULL;
+}
+
+static void remote_iommu_init(Object *obj)
+{
+    RemoteIommu *iommu = REMOTE_IOMMU(obj);
+
+    iommu->elem_by_devfn = g_hash_table_new_full(NULL, NULL, NULL, g_free);
+
+    qemu_mutex_init(&iommu->lock);
+}
+
+static void remote_iommu_finalize(Object *obj)
+{
+    RemoteIommu *iommu = REMOTE_IOMMU(obj);
+
+    qemu_mutex_destroy(&iommu->lock);
+
+    if (iommu->elem_by_devfn) {
+        g_hash_table_destroy(iommu->elem_by_devfn);
+        iommu->elem_by_devfn = NULL;
+    }
+}
+
+void remote_iommu_setup(PCIBus *pci_bus)
+{
+    RemoteIommu *iommu = NULL;
+
+    g_assert(pci_bus);
+
+    iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU));
+
+    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu);
+
+    object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu));
+
+    object_unref(OBJECT(iommu));
+}
+
+static const TypeInfo remote_iommu_info = {
+    .name = TYPE_REMOTE_IOMMU,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(RemoteIommu),
+    .instance_init = remote_iommu_init,
+    .instance_finalize = remote_iommu_finalize,
+};
+
+static void remote_iommu_register_types(void)
+{
+    type_register_static(&remote_iommu_info);
+}
+
+type_init(remote_iommu_register_types)
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
index ed91659794..cca5d25f50 100644
--- a/hw/remote/machine.c
+++ b/hw/remote/machine.c
@@ -21,6 +21,7 @@ 
 #include "qapi/error.h"
 #include "hw/pci/pci_host.h"
 #include "hw/remote/iohub.h"
+#include "hw/remote/iommu.h"
 #include "hw/qdev-core.h"
 
 static void remote_machine_init(MachineState *machine)
@@ -100,6 +101,16 @@  static void remote_machine_instance_init(Object *obj)
     s->auto_shutdown = true;
 }
 
+static void remote_machine_dev_unplug_cb(HotplugHandler *hotplug_dev,
+                                         DeviceState *dev, Error **errp)
+{
+    qdev_unrealize(dev);
+
+    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+        remote_iommu_unplug_dev(PCI_DEVICE(dev));
+    }
+}
+
 static void remote_machine_class_init(ObjectClass *oc, void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -108,7 +119,7 @@  static void remote_machine_class_init(ObjectClass *oc, void *data)
     mc->init = remote_machine_init;
     mc->desc = "Experimental remote machine";
 
-    hc->unplug = qdev_simple_device_unplug_cb;
+    hc->unplug = remote_machine_dev_unplug_cb;
 
     object_class_property_add_bool(oc, "vfio-user",
                                    remote_machine_get_vfio_user,
diff --git a/MAINTAINERS b/MAINTAINERS
index 37afdecc61..3e62ccab0a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3599,6 +3599,8 @@  F: hw/remote/iohub.c
 F: include/hw/remote/iohub.h
 F: subprojects/libvfio-user
 F: hw/remote/vfio-user-obj.c
+F: hw/remote/iommu.c
+F: include/hw/remote/iommu.h
 
 EBPF:
 M: Jason Wang <jasowang@redhat.com>
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index 534ac5df79..bcef83c8cc 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -6,6 +6,7 @@  remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c'))
 remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c'))
 
 remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: vfiouser)