
[v7,12/17] vfio-user: IOMMU support for remote device

Message ID 2fdb90acd40a1f79d571d4e68c56e6b08aded60d.1648234157.git.jag.raman@oracle.com
State New
Series vfio-user server in QEMU

Commit Message

Jag Raman March 25, 2022, 7:19 p.m. UTC
Assign separate address space for each device in the remote processes.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 include/hw/remote/iommu.h | 18 ++++++++
 hw/remote/iommu.c         | 95 +++++++++++++++++++++++++++++++++++++++
 MAINTAINERS               |  2 +
 hw/remote/meson.build     |  1 +
 4 files changed, 116 insertions(+)
 create mode 100644 include/hw/remote/iommu.h
 create mode 100644 hw/remote/iommu.c

Comments

Stefan Hajnoczi March 29, 2022, 12:35 p.m. UTC | #1
On Fri, Mar 25, 2022 at 03:19:41PM -0400, Jagannathan Raman wrote:
> Assign separate address space for each device in the remote processes.
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  include/hw/remote/iommu.h | 18 ++++++++
>  hw/remote/iommu.c         | 95 +++++++++++++++++++++++++++++++++++++++
>  MAINTAINERS               |  2 +
>  hw/remote/meson.build     |  1 +
>  4 files changed, 116 insertions(+)
>  create mode 100644 include/hw/remote/iommu.h
>  create mode 100644 hw/remote/iommu.c
> 
> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
> new file mode 100644
> index 0000000000..8f850400f1
> --- /dev/null
> +++ b/include/hw/remote/iommu.h
> @@ -0,0 +1,18 @@
> +/**
> + * Copyright © 2022 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef REMOTE_IOMMU_H
> +#define REMOTE_IOMMU_H
> +
> +#include "hw/pci/pci_bus.h"
> +
> +void remote_configure_iommu(PCIBus *pci_bus);
> +
> +void remote_iommu_del_device(PCIDevice *pci_dev);
> +
> +#endif
> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
> new file mode 100644
> index 0000000000..13f329b45d
> --- /dev/null
> +++ b/hw/remote/iommu.c
> @@ -0,0 +1,95 @@
> +/**
> + * IOMMU for remote device
> + *
> + * Copyright © 2022 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +
> +#include "hw/remote/iommu.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci.h"
> +#include "exec/memory.h"
> +#include "exec/address-spaces.h"
> +#include "trace.h"
> +
> +struct RemoteIommuElem {
> +    AddressSpace  as;
> +    MemoryRegion  mr;
> +};
> +
> +struct RemoteIommuTable {
> +    QemuMutex lock;
> +    GHashTable *elem_by_bdf;
> +} remote_iommu_table;
> +
> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
> +
> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
> +                                              void *opaque, int devfn)
> +{
> +    struct RemoteIommuTable *iommu_table = opaque;
> +    struct RemoteIommuElem *elem = NULL;
> +    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
> +
> +    elem = g_hash_table_lookup(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));

Why is a lock needed around g_hash_table_insert() below but no lock is
held around g_hash_table_lookup()?

The lookup-then-insert sequence isn't atomic, because the lookup and the
insert are separate operations and they are not done under a single lock.

> +
> +    if (!elem) {
> +        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
> +        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
> +
> +        elem = g_malloc0(sizeof(struct RemoteIommuElem));
> +
> +        memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
> +        address_space_init(&elem->as, &elem->mr, as_name);
> +
> +        qemu_mutex_lock(&iommu_table->lock);
> +        g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), elem);
> +        qemu_mutex_unlock(&iommu_table->lock);
> +    }
> +
> +    return &elem->as;
> +}
> +
> +static void remote_iommu_del_elem(gpointer data)
> +{
> +    struct RemoteIommuElem *elem = data;
> +
> +    g_assert(elem);
> +
> +    memory_region_unref(&elem->mr);
> +    address_space_destroy(&elem->as);
> +
> +    g_free(elem);
> +}
> +
> +void remote_iommu_del_device(PCIDevice *pci_dev)
> +{
> +    int pci_bdf;
> +
> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
> +        return;
> +    }
> +
> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
> +
> +    qemu_mutex_lock(&remote_iommu_table.lock);
> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
> +    qemu_mutex_unlock(&remote_iommu_table.lock);
> +}
> +
> +void remote_configure_iommu(PCIBus *pci_bus)
> +{
> +    if (!remote_iommu_table.elem_by_bdf) {
> +        remote_iommu_table.elem_by_bdf =
> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
> +        qemu_mutex_init(&remote_iommu_table.lock);
> +    }
> +
> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);

Why is remote_iommu_table global? It could be per-PCIBus and indexed by
just devfn instead of the full BDF.
Jag Raman March 29, 2022, 2:12 p.m. UTC | #2
> On Mar 29, 2022, at 8:35 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Fri, Mar 25, 2022 at 03:19:41PM -0400, Jagannathan Raman wrote:
>> Assign separate address space for each device in the remote processes.
>> 
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> include/hw/remote/iommu.h | 18 ++++++++
>> hw/remote/iommu.c         | 95 +++++++++++++++++++++++++++++++++++++++
>> MAINTAINERS               |  2 +
>> hw/remote/meson.build     |  1 +
>> 4 files changed, 116 insertions(+)
>> create mode 100644 include/hw/remote/iommu.h
>> create mode 100644 hw/remote/iommu.c
>> 
>> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
>> new file mode 100644
>> index 0000000000..8f850400f1
>> --- /dev/null
>> +++ b/include/hw/remote/iommu.h
>> @@ -0,0 +1,18 @@
>> +/**
>> + * Copyright © 2022 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#ifndef REMOTE_IOMMU_H
>> +#define REMOTE_IOMMU_H
>> +
>> +#include "hw/pci/pci_bus.h"
>> +
>> +void remote_configure_iommu(PCIBus *pci_bus);
>> +
>> +void remote_iommu_del_device(PCIDevice *pci_dev);
>> +
>> +#endif
>> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
>> new file mode 100644
>> index 0000000000..13f329b45d
>> --- /dev/null
>> +++ b/hw/remote/iommu.c
>> @@ -0,0 +1,95 @@
>> +/**
>> + * IOMMU for remote device
>> + *
>> + * Copyright © 2022 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +
>> +#include "hw/remote/iommu.h"
>> +#include "hw/pci/pci_bus.h"
>> +#include "hw/pci/pci.h"
>> +#include "exec/memory.h"
>> +#include "exec/address-spaces.h"
>> +#include "trace.h"
>> +
>> +struct RemoteIommuElem {
>> +    AddressSpace  as;
>> +    MemoryRegion  mr;
>> +};
>> +
>> +struct RemoteIommuTable {
>> +    QemuMutex lock;
>> +    GHashTable *elem_by_bdf;
>> +} remote_iommu_table;
>> +
>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
>> +
>> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
>> +                                              void *opaque, int devfn)
>> +{
>> +    struct RemoteIommuTable *iommu_table = opaque;
>> +    struct RemoteIommuElem *elem = NULL;
>> +    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
>> +
>> +    elem = g_hash_table_lookup(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));
> 
> Why is a lock needed around g_hash_table_insert() below but no lock is
> held around g_hash_table_lookup()?
> 
> The lookup-then-insert sequence isn't atomic, because the lookup and the
> insert are separate operations and they are not done under a single lock.

Thanks for the catch! The lock should cover lookup also.
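
Roughly, holding the lock across both the lookup and the insert would look
like this (a sketch reusing the names from the patch):

static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
                                              void *opaque, int devfn)
{
    struct RemoteIommuTable *iommu_table = opaque;
    struct RemoteIommuElem *elem = NULL;
    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);

    /* hold the lock across lookup and insert so the sequence is atomic */
    qemu_mutex_lock(&iommu_table->lock);

    elem = g_hash_table_lookup(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));
    if (!elem) {
        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);

        elem = g_malloc0(sizeof(struct RemoteIommuElem));

        memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
        address_space_init(&elem->as, &elem->mr, as_name);

        g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), elem);
    }

    qemu_mutex_unlock(&iommu_table->lock);

    return &elem->as;
}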

> 
>> +
>> +    if (!elem) {
>> +        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
>> +        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
>> +
>> +        elem = g_malloc0(sizeof(struct RemoteIommuElem));
>> +
>> +        memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
>> +        address_space_init(&elem->as, &elem->mr, as_name);
>> +
>> +        qemu_mutex_lock(&iommu_table->lock);
>> +        g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), elem);
>> +        qemu_mutex_unlock(&iommu_table->lock);
>> +    }
>> +
>> +    return &elem->as;
>> +}
>> +
>> +static void remote_iommu_del_elem(gpointer data)
>> +{
>> +    struct RemoteIommuElem *elem = data;
>> +
>> +    g_assert(elem);
>> +
>> +    memory_region_unref(&elem->mr);
>> +    address_space_destroy(&elem->as);
>> +
>> +    g_free(elem);
>> +}
>> +
>> +void remote_iommu_del_device(PCIDevice *pci_dev)
>> +{
>> +    int pci_bdf;
>> +
>> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
>> +        return;
>> +    }
>> +
>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
>> +
>> +    qemu_mutex_lock(&remote_iommu_table.lock);
>> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
>> +    qemu_mutex_unlock(&remote_iommu_table.lock);
>> +}
>> +
>> +void remote_configure_iommu(PCIBus *pci_bus)
>> +{
>> +    if (!remote_iommu_table.elem_by_bdf) {
>> +        remote_iommu_table.elem_by_bdf =
>> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
>> +        qemu_mutex_init(&remote_iommu_table.lock);
>> +    }
>> +
>> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
> 
> Why is remote_iommu_table global? It could be per-PCIBus and indexed by
> just devfn instead of the full BDF.

It’s global because remote_iommu_del_device() needs it for cleanup.

OK, will make it a per bus property.

Thank you!
--
Jag
Stefan Hajnoczi March 29, 2022, 2:48 p.m. UTC | #3
On Tue, Mar 29, 2022 at 02:12:40PM +0000, Jag Raman wrote:
> > On Mar 29, 2022, at 8:35 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > On Fri, Mar 25, 2022 at 03:19:41PM -0400, Jagannathan Raman wrote:
> >> +void remote_iommu_del_device(PCIDevice *pci_dev)
> >> +{
> >> +    int pci_bdf;
> >> +
> >> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
> >> +        return;
> >> +    }
> >> +
> >> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
> >> +
> >> +    qemu_mutex_lock(&remote_iommu_table.lock);
> >> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
> >> +    qemu_mutex_unlock(&remote_iommu_table.lock);
> >> +}
> >> +
> >> +void remote_configure_iommu(PCIBus *pci_bus)
> >> +{
> >> +    if (!remote_iommu_table.elem_by_bdf) {
> >> +        remote_iommu_table.elem_by_bdf =
> >> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
> >> +        qemu_mutex_init(&remote_iommu_table.lock);
> >> +    }
> >> +
> >> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
> > 
> > Why is remote_iommu_table global? It could be per-PCIBus and indexed by
> > just devfn instead of the full BDF.
> 
> It’s global because remote_iommu_del_device() needs it for cleanup.

Can remote_iommu_del_device() use pci_get_bus(pci_dev)->irq_opaque to
get the per-bus table?

Stefan
Jag Raman March 29, 2022, 7:58 p.m. UTC | #4
> On Mar 29, 2022, at 10:48 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Tue, Mar 29, 2022 at 02:12:40PM +0000, Jag Raman wrote:
>>> On Mar 29, 2022, at 8:35 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> On Fri, Mar 25, 2022 at 03:19:41PM -0400, Jagannathan Raman wrote:
>>>> +void remote_iommu_del_device(PCIDevice *pci_dev)
>>>> +{
>>>> +    int pci_bdf;
>>>> +
>>>> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
>>>> +
>>>> +    qemu_mutex_lock(&remote_iommu_table.lock);
>>>> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
>>>> +    qemu_mutex_unlock(&remote_iommu_table.lock);
>>>> +}
>>>> +
>>>> +void remote_configure_iommu(PCIBus *pci_bus)
>>>> +{
>>>> +    if (!remote_iommu_table.elem_by_bdf) {
>>>> +        remote_iommu_table.elem_by_bdf =
>>>> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
>>>> +        qemu_mutex_init(&remote_iommu_table.lock);
>>>> +    }
>>>> +
>>>> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
>>> 
>>> Why is remote_iommu_table global? It could be per-PCIBus and indexed by
>>> just devfn instead of the full BDF.
>> 
>> It’s global because remote_iommu_del_device() needs it for cleanup.
> 
> Can remote_iommu_del_device() use pci_get_bus(pci_dev)->irq_opaque to
> get the per-bus table?

pci_get_bus(pci_dev)->irq_opaque is used for interrupts.

PCIBus already has an iommu_opaque, which is a private
member of the bus structure. It’s passed as an argument
to the iommu_fn().

We could add a getter function to retrieve PCIBus->iommu_opaque
in remote_iommu_del_device(). That way we could avoid the global variable.
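
A rough sketch of that (the getter name below is made up; nothing like it
exists in hw/pci yet):

/* hw/pci/pci.c (sketch): expose the bus's private IOMMU opaque pointer */
void *pci_bus_get_iommu_opaque(PCIBus *bus)
{
    return bus->iommu_opaque;
}

/* remote_iommu_del_device() could then drop the global table */
void remote_iommu_del_device(PCIDevice *pci_dev)
{
    struct RemoteIommuTable *iommu_table;
    int pci_bdf;

    if (!pci_dev) {
        return;
    }

    iommu_table = pci_bus_get_iommu_opaque(pci_get_bus(pci_dev));
    if (!iommu_table || !iommu_table->elem_by_bdf) {
        return;
    }

    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);

    qemu_mutex_lock(&iommu_table->lock);
    g_hash_table_remove(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));
    qemu_mutex_unlock(&iommu_table->lock);
}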

Thank you!
--
Jag

> 
> Stefan
Stefan Hajnoczi March 30, 2022, 10:04 a.m. UTC | #5
On Tue, Mar 29, 2022 at 07:58:51PM +0000, Jag Raman wrote:
> 
> 
> > On Mar 29, 2022, at 10:48 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Tue, Mar 29, 2022 at 02:12:40PM +0000, Jag Raman wrote:
> >>> On Mar 29, 2022, at 8:35 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>> On Fri, Mar 25, 2022 at 03:19:41PM -0400, Jagannathan Raman wrote:
> >>>> +void remote_iommu_del_device(PCIDevice *pci_dev)
> >>>> +{
> >>>> +    int pci_bdf;
> >>>> +
> >>>> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
> >>>> +
> >>>> +    qemu_mutex_lock(&remote_iommu_table.lock);
> >>>> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
> >>>> +    qemu_mutex_unlock(&remote_iommu_table.lock);
> >>>> +}
> >>>> +
> >>>> +void remote_configure_iommu(PCIBus *pci_bus)
> >>>> +{
> >>>> +    if (!remote_iommu_table.elem_by_bdf) {
> >>>> +        remote_iommu_table.elem_by_bdf =
> >>>> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
> >>>> +        qemu_mutex_init(&remote_iommu_table.lock);
> >>>> +    }
> >>>> +
> >>>> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
> >>> 
> >>> Why is remote_iommu_table global? It could be per-PCIBus and indexed by
> >>> just devfn instead of the full BDF.
> >> 
> >> It’s global because remote_iommu_del_device() needs it for cleanup.
> > 
> > Can remote_iommu_del_device() use pci_get_bus(pci_dev)->irq_opaque to
> > get the per-bus table?
> 
> pci_get_bus(pci_dev)->irq_opaque is used for interrupts.
> 
> PCIBus already has an iommu_opaque, which is a private
> member of the bus structure. It’s passed as an argument
> to the iommu_fn().
> 
> We could add a getter function to retrieve PCIBus->iommu_opaque
> in remote_iommu_del_device(). That way we could avoid the global variable.

I've CCed Michael, Peter, and Jason regarding IOMMUs.

This makes me wonder whether there is a deeper issue with the
pci_setup_iommu() API: the lack of per-device cleanup callbacks.
Per-device IOMMU resources should be freed when a device is hot
unplugged.

From what I can tell this is not the case today:

- hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
  address spaces but I can't find where they are removed and freed.
  VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.

- hw/i386/amd_iommu.c has similar leaks.

Your patch introduces a custom remote_iommu_del_device() function, but I
think the pci_setup_iommu() API should take a device_del() callback so
IOMMUs have a standard interface for handling per-device cleanup.

BTW in your case remote_iommu_del_device() is sufficient because hot
unplug is blocked by the new unplug blocker mechanism you introduced.
For other IOMMUs unplug will not be blocked and therefore IOMMUs really
need a callback for per-device cleanup.

Stefan
Peter Xu March 30, 2022, 12:53 p.m. UTC | #6
On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:
> This makes me wonder whether there is a deeper issue with the
> pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> Per-device IOMMU resources should be freed when a device is hot
> unplugged.
> 
> From what I can tell this is not the case today:
> 
> - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
>   address spaces but I can't find where they are removed and freed.
>   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.
> 
> - hw/i386/amd_iommu.c has similar leaks.

AFAICT it's because there's no device-specific data cached in the
per-device IOMMU address space, at least so far.  IOW, all the data
structures allocated here can be re-used when a new device is plugged in
after the old device unplugged.

It's definitely not ideal since after unplug (and before a new device
plugged in) the resource is not needed at all so it's kind of wasted, but
it should work functionally.  If to achieve that, some iommu_unplug() or
iommu_cleanup() hook sounds reasonable.

One thing I'm not sure about is that these iommu ops are per-bus, not
per-device.  So I'm not sure whether that's what we wanted here, because
remote device cleanup seems to be per-device only.

Thanks,
Stefan Hajnoczi March 30, 2022, 4:08 p.m. UTC | #7
On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:
> On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:
> > This makes me wonder whether there is a deeper issue with the
> > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > Per-device IOMMU resources should be freed when a device is hot
> > unplugged.
> > 
> > From what I can tell this is not the case today:
> > 
> > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
> >   address spaces but I can't find where they are removed and freed.
> >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.
> > 
> > - hw/i386/amd_iommu.c has similar leaks.
> 
> AFAICT it's because there's no device-specific data cached in the
> per-device IOMMU address space, at least so far.  IOW, all the data
> structures allocated here can be re-used when a new device is plugged in
> after the old device unplugged.
> 
> It's definitely not ideal since after unplug (and before a new device
> plugged in) the resource is not needed at all so it's kind of wasted, but
> it should work functionally.  If to achieve that, some iommu_unplug() or
> iommu_cleanup() hook sounds reasonable.

I guess the question is whether PCI busses can be hotplugged with
IOMMUs. If yes, then there is a memory leak that matters for
intel_iommu.c and amd_iommu.c.

Stefan
Peter Xu March 30, 2022, 5:13 p.m. UTC | #8
On Wed, Mar 30, 2022 at 05:08:24PM +0100, Stefan Hajnoczi wrote:
> On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:
> > On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:
> > > This makes me wonder whether there is a deeper issue with the
> > > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > > Per-device IOMMU resources should be freed when a device is hot
> > > unplugged.
> > > 
> > > From what I can tell this is not the case today:
> > > 
> > > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
> > >   address spaces but I can't find where they are removed and freed.
> > >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.
> > > 
> > > - hw/i386/amd_iommu.c has similar leaks.
> > 
> > AFAICT it's because there's no device-specific data cached in the
> > per-device IOMMU address space, at least so far.  IOW, all the data
> > structures allocated here can be re-used when a new device is plugged in
> > after the old device unplugged.
> > 
> > It's definitely not ideal since after unplug (and before a new device
> > plugged in) the resource is not needed at all so it's kind of wasted, but
> > it should work functionally.  If to achieve that, some iommu_unplug() or
> > iommu_cleanup() hook sounds reasonable.
> 
> I guess the question is whether PCI busses can be hotplugged with
> IOMMUs. If yes, then there is a memory leak that matters for
> intel_iommu.c and amd_iommu.c.

It can't, and we only support one vIOMMU so far for both (commit
1b3bf13890fd849b26).  Thanks,
Stefan Hajnoczi March 31, 2022, 9:47 a.m. UTC | #9
On Wed, Mar 30, 2022 at 01:13:03PM -0400, Peter Xu wrote:
> On Wed, Mar 30, 2022 at 05:08:24PM +0100, Stefan Hajnoczi wrote:
> > On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:
> > > On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:
> > > > This makes me wonder whether there is a deeper issue with the
> > > > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > > > Per-device IOMMU resources should be freed when a device is hot
> > > > unplugged.
> > > > 
> > > > From what I can tell this is not the case today:
> > > > 
> > > > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
> > > >   address spaces but I can't find where they are removed and freed.
> > > >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.
> > > > 
> > > > - hw/i386/amd_iommu.c has similar leaks.
> > > 
> > > AFAICT it's because there's no device-specific data cached in the
> > > per-device IOMMU address space, at least so far.  IOW, all the data
> > > structures allocated here can be re-used when a new device is plugged in
> > > after the old device unplugged.
> > > 
> > > It's definitely not ideal since after unplug (and before a new device
> > > plugged in) the resource is not needed at all so it's kind of wasted, but
> > > it should work functionally.  If to achieve that, some iommu_unplug() or
> > > iommu_cleanup() hook sounds reasonable.
> > 
> > I guess the question is whether PCI busses can be hotplugged with
> > IOMMUs. If yes, then there is a memory leak that matters for
> > intel_iommu.c and amd_iommu.c.
> 
> It can't, and we only support one vIOMMU so far for both (commit
> 1b3bf13890fd849b26).  Thanks,

I see, thanks!

Okay, summarizing options for the vfio-user IOMMU:

1. Use the same singleton approach as existing IOMMUs where the
   MemoryRegion/AddressSpace are never freed. Don't bother deleting.

2. Keep the approach in this patch where vfio-user code manually calls a
   custom delete function (not part of the pci_setup_iommu() API). This
   is slightly awkward to do without global state and that's what
   started this discussion.

3. Introduce an optional pci_setup_iommu() callback:

   typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, void *opaque, int devfn);

   Solves the awkwardness of option #2. Not needed by existing IOMMU
   devices.
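
For concreteness, a rough sketch of one way option 3 could look, keeping the
existing pci_setup_iommu() untouched (the registration function, the callback
and the new PCIBus field below are made up for illustration; none of this
exists today):

/* include/hw/pci/pci.h (sketch) */
typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, void *opaque, int devfn);

void pci_setup_iommu_unplug(PCIBus *bus, PCIIOMMUDeviceUnplug unplug);

/* hw/pci/pci.c (sketch) */
void pci_setup_iommu_unplug(PCIBus *bus, PCIIOMMUDeviceUnplug unplug)
{
    bus->iommu_unplug = unplug;           /* hypothetical new PCIBus field */
}

/* called by the PCI core when a device on the bus is unrealized */
static void pci_iommu_unplug_device(PCIDevice *pci_dev)
{
    PCIBus *bus = pci_get_bus(pci_dev);

    if (bus->iommu_unplug) {
        bus->iommu_unplug(bus, bus->iommu_opaque, pci_dev->devfn);
    }
}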

Any preferences anyone?

Stefan
Peter Xu March 31, 2022, 12:41 p.m. UTC | #10
On Thu, Mar 31, 2022 at 10:47:33AM +0100, Stefan Hajnoczi wrote:
> On Wed, Mar 30, 2022 at 01:13:03PM -0400, Peter Xu wrote:
> > On Wed, Mar 30, 2022 at 05:08:24PM +0100, Stefan Hajnoczi wrote:
> > > On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:
> > > > On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:
> > > > > This makes me wonder whether there is a deeper issue with the
> > > > > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > > > > Per-device IOMMU resources should be freed when a device is hot
> > > > > unplugged.
> > > > > 
> > > > > From what I can tell this is not the case today:
> > > > > 
> > > > > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
> > > > >   address spaces but I can't find where they are removed and freed.
> > > > >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.
> > > > > 
> > > > > - hw/i386/amd_iommu.c has similar leaks.
> > > > 
> > > > AFAICT it's because there's no device-specific data cached in the
> > > > per-device IOMMU address space, at least so far.  IOW, all the data
> > > > structures allocated here can be re-used when a new device is plugged in
> > > > after the old device unplugged.
> > > > 
> > > > It's definitely not ideal since after unplug (and before a new device
> > > > plugged in) the resource is not needed at all so it's kind of wasted, but
> > > > it should work functionally.  If to achieve that, some iommu_unplug() or
> > > > iommu_cleanup() hook sounds reasonable.
> > > 
> > > I guess the question is whether PCI busses can be hotplugged with
> > > IOMMUs. If yes, then there is a memory leak that matters for
> > > intel_iommu.c and amd_iommu.c.
> > 
> > It can't, and we only support one vIOMMU so far for both (commit
> > 1b3bf13890fd849b26).  Thanks,
> 
> I see, thanks!
> 
> Okay, summarizing options for the vfio-user IOMMU:
> 
> 1. Use the same singleton approach as existing IOMMUs where the
>    MemoryRegion/AddressSpace are never freed. Don't bother deleting.
> 
> 2. Keep the approach in this patch where vfio-user code manually calls a
>    custom delete function (not part of the pci_setup_iommu() API). This
>    is slightly awkward to do without global state and that's what
>    started this discussion.
> 
> 3. Introduce an optional pci_setup_iommu() callback:
> 
>    typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, void *opaque, int devfn);
> 
>    Solves the awkwardness of option #2. Not needed by existing IOMMU
>    devices.

Looks all workable to me.  One tiny thing is if we'd like 3) we may want to
pass over the PCIDevice* too because in this case IIUC we'd need to double
check the device class before doing anything - we may not want to call the
vfio-user callbacks for general emulated devices under the same pci bus.

I think we could also fetch that from PCIBus.devices[devfn] but that's just
not as obvious.

Option 4) is as mentioned previously, that we add another device unplug
hook that can be registered per-device.  I just didn't think thoroughly on
how it would interact with the current HotplugHandler design yet.. it looks
quite similar but so far it's either for the machine type or pci bus, not
capable of registering on one single device (and it's always a mystery to
me why we'd rather ignore the per-bus hook if the machine hook
existed.. that's in qdev_get_hotplug_handler).

Copying Igor too.
Igor Mammedov April 13, 2022, 2:25 p.m. UTC | #11
On Fri, 25 Mar 2022 15:19:41 -0400
Jagannathan Raman <jag.raman@oracle.com> wrote:

> Assign separate address space for each device in the remote processes.
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  include/hw/remote/iommu.h | 18 ++++++++
>  hw/remote/iommu.c         | 95 +++++++++++++++++++++++++++++++++++++++
>  MAINTAINERS               |  2 +
>  hw/remote/meson.build     |  1 +
>  4 files changed, 116 insertions(+)
>  create mode 100644 include/hw/remote/iommu.h
>  create mode 100644 hw/remote/iommu.c
> 
> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
> new file mode 100644
> index 0000000000..8f850400f1
> --- /dev/null
> +++ b/include/hw/remote/iommu.h
> @@ -0,0 +1,18 @@
> +/**
> + * Copyright © 2022 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef REMOTE_IOMMU_H
> +#define REMOTE_IOMMU_H
> +
> +#include "hw/pci/pci_bus.h"
> +
> +void remote_configure_iommu(PCIBus *pci_bus);
> +
> +void remote_iommu_del_device(PCIDevice *pci_dev);
> +
> +#endif
> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
> new file mode 100644
> index 0000000000..13f329b45d
> --- /dev/null
> +++ b/hw/remote/iommu.c
> @@ -0,0 +1,95 @@
> +/**
> + * IOMMU for remote device
> + *
> + * Copyright © 2022 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +
> +#include "hw/remote/iommu.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci.h"
> +#include "exec/memory.h"
> +#include "exec/address-spaces.h"
> +#include "trace.h"
> +
> +struct RemoteIommuElem {
> +    AddressSpace  as;
> +    MemoryRegion  mr;
> +};
> +
> +struct RemoteIommuTable {
> +    QemuMutex lock;
> +    GHashTable *elem_by_bdf;
> +} remote_iommu_table;
> +
> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
> +
> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
> +                                              void *opaque, int devfn)
> +{
> +    struct RemoteIommuTable *iommu_table = opaque;
> +    struct RemoteIommuElem *elem = NULL;
> +    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
> +
> +    elem = g_hash_table_lookup(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));
> +
> +    if (!elem) {
> +        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
> +        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
> +
> +        elem = g_malloc0(sizeof(struct RemoteIommuElem));
> +
> +        memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
goes here:
   memory_region_do_init()
        if (!owner) {                                                            
            owner = container_get(qdev_get_machine(), "/unattached");            
        }  

then

> +        address_space_init(&elem->as, &elem->mr, as_name);
> +
> +        qemu_mutex_lock(&iommu_table->lock);
> +        g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), elem);
> +        qemu_mutex_unlock(&iommu_table->lock);
> +    }
> +
> +    return &elem->as;
> +}
> +
> +static void remote_iommu_del_elem(gpointer data)
> +{
> +    struct RemoteIommuElem *elem = data;
> +
> +    g_assert(elem);
> +
> +    memory_region_unref(&elem->mr);

here we call
      object_unref(mr->owner); 
leaving a dangling pointer in the owner '(qdev_get_machine(), "/unattached")',
which doesn't look correct.

I thought that memory_region_unref() should always be paired with memory_region_ref(),

and looking at memory_region_init(...owner...) history it looks like
owner-less (NULL) regions are not meant to be deleted ever.

> +    address_space_destroy(&elem->as);
> +
> +    g_free(elem);
> +}
> +
> +void remote_iommu_del_device(PCIDevice *pci_dev)
> +{
> +    int pci_bdf;
> +
> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
> +        return;
> +    }
> +
> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
> +
> +    qemu_mutex_lock(&remote_iommu_table.lock);
> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
> +    qemu_mutex_unlock(&remote_iommu_table.lock);
> +}
> +
> +void remote_configure_iommu(PCIBus *pci_bus)
> +{
> +    if (!remote_iommu_table.elem_by_bdf) {
> +        remote_iommu_table.elem_by_bdf =
> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
> +        qemu_mutex_init(&remote_iommu_table.lock);
> +    }
> +
> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
> +}
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e7b0297a63..21694a9698 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3599,6 +3599,8 @@ F: hw/remote/iohub.c
>  F: include/hw/remote/iohub.h
>  F: subprojects/libvfio-user
>  F: hw/remote/vfio-user-obj.c
> +F: hw/remote/iommu.c
> +F: include/hw/remote/iommu.h
>  
>  EBPF:
>  M: Jason Wang <jasowang@redhat.com>
> diff --git a/hw/remote/meson.build b/hw/remote/meson.build
> index 534ac5df79..bcef83c8cc 100644
> --- a/hw/remote/meson.build
> +++ b/hw/remote/meson.build
> @@ -6,6 +6,7 @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
>  remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
>  remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
>  remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
> +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c'))
>  remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c'))
>  
>  remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: vfiouser)
Igor Mammedov April 13, 2022, 2:37 p.m. UTC | #12
On Thu, 31 Mar 2022 08:41:01 -0400
Peter Xu <peterx@redhat.com> wrote:

> On Thu, Mar 31, 2022 at 10:47:33AM +0100, Stefan Hajnoczi wrote:
> > On Wed, Mar 30, 2022 at 01:13:03PM -0400, Peter Xu wrote:  
> > > On Wed, Mar 30, 2022 at 05:08:24PM +0100, Stefan Hajnoczi wrote:  
> > > > On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:  
> > > > > On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:  
> > > > > > This makes me wonder whether there is a deeper issue with the
> > > > > > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > > > > > Per-device IOMMU resources should be freed when a device is hot
> > > > > > unplugged.
> > > > > > 
> > > > > > From what I can tell this is not the case today:
> > > > > > 
> > > > > > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
> > > > > >   address spaces but I can't find where they are removed and freed.
> > > > > >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.
> > > > > > 
> > > > > > - hw/i386/amd_iommu.c has similar leaks.  
> > > > > 
> > > > > AFAICT it's because there's no device-specific data cached in the
> > > > > per-device IOMMU address space, at least so far.  IOW, all the data
> > > > > structures allocated here can be re-used when a new device is plugged in
> > > > > after the old device unplugged.
> > > > > 
> > > > > It's definitely not ideal since after unplug (and before a new device
> > > > > plugged in) the resource is not needed at all so it's kind of wasted, but
> > > > > it should work functionally.  If to achieve that, some iommu_unplug() or
> > > > > iommu_cleanup() hook sounds reasonable.  
> > > > 
> > > > I guess the question is whether PCI busses can be hotplugged with
> > > > IOMMUs. If yes, then there is a memory leak that matters for
> > > > intel_iommu.c and amd_iommu.c.  
> > > 
> > > It can't, and we only support one vIOMMU so far for both (commit
> > > 1b3bf13890fd849b26).  Thanks,  
> > 
> > I see, thanks!
> > 
> > Okay, summarizing options for the vfio-user IOMMU:
> > 
> > 1. Use the same singleton approach as existing IOMMUs where the
> >    MemoryRegion/AddressSpace are never freed. Don't bother deleting.
> > 
> > 2. Keep the approach in this patch where vfio-user code manually calls a
> >    custom delete function (not part of the pci_setup_iommu() API). This
> >    is slightly awkward to do without global state and that's what
> >    started this discussion.
> > 
> > 3. Introduce an optional pci_setup_iommu() callback:
> > 
> >    typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, void *opaque, int devfn);
> > 
> >    Solves the awkwardness of option #2. Not needed by existing IOMMU
> >    devices.  
> 
> Looks all workable to me.  One tiny thing is if we'd like 3) we may want to
> pass over the PCIDevice* too because in this case IIUC we'd need to double
> check the device class before doing anything - we may not want to call the
> vfio-user callbacks for general emulated devices under the same pci bus.
> 
> I think we could also fetch that from PCIBus.devices[devfn] but that's just
> not as obvious.
> 
> Option 4) is as mentioned previously, that we add another device unplug
> hook that can be registered per-device.  I just didn't think thoroughly on
can you expand on why per device hook is needed?

> how it would interact with the current HotplugHandler design yet.. it looks
> quite similar but so far it's either for the machine type or pci bus, not
> capable of registering on one single device (and it's always a mystery to
> me why we'd rather ignore the per-bus hook if the machine hook
> existed.. that's in qdev_get_hotplug_handler).

The machine hook is there mainly for bus-less devices; if it's not defined,
the code will fall back to the bus handler if one exists.

However, the machine hook can also be used to override the default hotplug
chain to implement a non-trivial plug/unplug flow.
For example, see pc_get_hotplug_handler() and the corresponding
pc_machine_device_[pre_plug|plug|unplug...]_cb() callbacks, where the
plug/unplug chain is altered for some PCI device types.
Perhaps the same can be done for vfio.
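
Very roughly, that pattern looks like this (simplified; the names below are
only for illustration, and it assumes the machine type implements the
TYPE_HOTPLUG_HANDLER interface, as the PC machine does):

/* sketch; would live in a machine's .c file with the usual
 * hw/boards.h, hw/hotplug.h and hw/pci/pci.h includes */

static void my_machine_device_unplug_cb(HotplugHandler *hotplug_dev,
                                        DeviceState *dev, Error **errp)
{
    /* per-device-type cleanup would go here */
}

static HotplugHandler *my_machine_get_hotplug_handler(MachineState *machine,
                                                      DeviceState *dev)
{
    /* route (un)plug of selected device types to the machine's own handler */
    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        return HOTPLUG_HANDLER(machine);
    }

    return NULL; /* qdev then falls back to the parent bus hotplug handler */
}

static void my_machine_class_init(ObjectClass *oc, void *data)
{
    MachineClass *mc = MACHINE_CLASS(oc);
    HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);

    mc->get_hotplug_handler = my_machine_get_hotplug_handler;
    hc->unplug = my_machine_device_unplug_cb;
}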

> 
> Copying Igor too.
>
Jag Raman April 13, 2022, 6:25 p.m. UTC | #13
> On Apr 13, 2022, at 10:25 AM, Igor Mammedov <imammedo@redhat.com> wrote:
> 
> On Fri, 25 Mar 2022 15:19:41 -0400
> Jagannathan Raman <jag.raman@oracle.com> wrote:
> 
>> Assign separate address space for each device in the remote processes.
>> 
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> include/hw/remote/iommu.h | 18 ++++++++
>> hw/remote/iommu.c         | 95 +++++++++++++++++++++++++++++++++++++++
>> MAINTAINERS               |  2 +
>> hw/remote/meson.build     |  1 +
>> 4 files changed, 116 insertions(+)
>> create mode 100644 include/hw/remote/iommu.h
>> create mode 100644 hw/remote/iommu.c
>> 
>> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
>> new file mode 100644
>> index 0000000000..8f850400f1
>> --- /dev/null
>> +++ b/include/hw/remote/iommu.h
>> @@ -0,0 +1,18 @@
>> +/**
>> + * Copyright © 2022 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#ifndef REMOTE_IOMMU_H
>> +#define REMOTE_IOMMU_H
>> +
>> +#include "hw/pci/pci_bus.h"
>> +
>> +void remote_configure_iommu(PCIBus *pci_bus);
>> +
>> +void remote_iommu_del_device(PCIDevice *pci_dev);
>> +
>> +#endif
>> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
>> new file mode 100644
>> index 0000000000..13f329b45d
>> --- /dev/null
>> +++ b/hw/remote/iommu.c
>> @@ -0,0 +1,95 @@
>> +/**
>> + * IOMMU for remote device
>> + *
>> + * Copyright © 2022 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +
>> +#include "hw/remote/iommu.h"
>> +#include "hw/pci/pci_bus.h"
>> +#include "hw/pci/pci.h"
>> +#include "exec/memory.h"
>> +#include "exec/address-spaces.h"
>> +#include "trace.h"
>> +
>> +struct RemoteIommuElem {
>> +    AddressSpace  as;
>> +    MemoryRegion  mr;
>> +};
>> +
>> +struct RemoteIommuTable {
>> +    QemuMutex lock;
>> +    GHashTable *elem_by_bdf;
>> +} remote_iommu_table;
>> +
>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
>> +
>> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
>> +                                              void *opaque, int devfn)
>> +{
>> +    struct RemoteIommuTable *iommu_table = opaque;
>> +    struct RemoteIommuElem *elem = NULL;
>> +    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
>> +
>> +    elem = g_hash_table_lookup(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));
>> +
>> +    if (!elem) {
>> +        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
>> +        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
>> +
>> +        elem = g_malloc0(sizeof(struct RemoteIommuElem));
>> +
>> +        memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
> goes here:
>   memory_region_do_init()
>        if (!owner) {                                                            
>            owner = container_get(qdev_get_machine(), "/unattached");            
>        }  
> 
> then
> 
>> +        address_space_init(&elem->as, &elem->mr, as_name);
>> +
>> +        qemu_mutex_lock(&iommu_table->lock);
>> +        g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), elem);
>> +        qemu_mutex_unlock(&iommu_table->lock);
>> +    }
>> +
>> +    return &elem->as;
>> +}
>> +
>> +static void remote_iommu_del_elem(gpointer data)
>> +{
>> +    struct RemoteIommuElem *elem = data;
>> +
>> +    g_assert(elem);
>> +
>> +    memory_region_unref(&elem->mr);
> 
> here we call
>      object_unref(mr->owner); 
> leaving dangling pointer in owner '(qdev_get_machine(), "/unattached")'
> it doesn't look correct
> 
> I thought that memory_region_unref() should be always paired with memory_region_ref()
> 
> and looking at memory_region_init(...owner...) history it looks like
> owner-less (NULL) regions are not meant to be deleted ever.

Hi Igor,

Thanks for the pointers about ref counters for MemoryRegions.

It makes sense - MemoryRegions are not QEMU Objects. So their
owner’s ref counters are used instead. So the expectation is that
when the owner is destroyed, the MemoryRegions initialized by them
also get destroyed simultaneously.

In this case, RemoteIommuElem->mr does not have an owner. Therefore,
we don’t have to manage its ref counter. When RemoteIommuElem is
destroyed, the MemoryRegion should be cleaned up automatically.

--
Jag

> 
>> +    address_space_destroy(&elem->as);
>> +
>> +    g_free(elem);
>> +}
>> +
>> +void remote_iommu_del_device(PCIDevice *pci_dev)
>> +{
>> +    int pci_bdf;
>> +
>> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
>> +        return;
>> +    }
>> +
>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
>> +
>> +    qemu_mutex_lock(&remote_iommu_table.lock);
>> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
>> +    qemu_mutex_unlock(&remote_iommu_table.lock);
>> +}
>> +
>> +void remote_configure_iommu(PCIBus *pci_bus)
>> +{
>> +    if (!remote_iommu_table.elem_by_bdf) {
>> +        remote_iommu_table.elem_by_bdf =
>> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
>> +        qemu_mutex_init(&remote_iommu_table.lock);
>> +    }
>> +
>> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
>> +}
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index e7b0297a63..21694a9698 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -3599,6 +3599,8 @@ F: hw/remote/iohub.c
>> F: include/hw/remote/iohub.h
>> F: subprojects/libvfio-user
>> F: hw/remote/vfio-user-obj.c
>> +F: hw/remote/iommu.c
>> +F: include/hw/remote/iommu.h
>> 
>> EBPF:
>> M: Jason Wang <jasowang@redhat.com>
>> diff --git a/hw/remote/meson.build b/hw/remote/meson.build
>> index 534ac5df79..bcef83c8cc 100644
>> --- a/hw/remote/meson.build
>> +++ b/hw/remote/meson.build
>> @@ -6,6 +6,7 @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
>> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
>> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
>> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
>> +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c'))
>> remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c'))
>> 
>> remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: vfiouser)
>
Peter Xu April 13, 2022, 7:08 p.m. UTC | #14
On Wed, Apr 13, 2022 at 04:37:35PM +0200, Igor Mammedov wrote:
> On Thu, 31 Mar 2022 08:41:01 -0400
> Peter Xu <peterx@redhat.com> wrote:
> 
> > On Thu, Mar 31, 2022 at 10:47:33AM +0100, Stefan Hajnoczi wrote:
> > > On Wed, Mar 30, 2022 at 01:13:03PM -0400, Peter Xu wrote:  
> > > > On Wed, Mar 30, 2022 at 05:08:24PM +0100, Stefan Hajnoczi wrote:  
> > > > > On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:  
> > > > > > On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:  
> > > > > > > This makes me wonder whether there is a deeper issue with the
> > > > > > > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > > > > > > Per-device IOMMU resources should be freed when a device is hot
> > > > > > > unplugged.
> > > > > > > 
> > > > > > > From what I can tell this is not the case today:
> > > > > > > 
> > > > > > > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
> > > > > > >   address spaces but I can't find where they are removed and freed.
> > > > > > >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.
> > > > > > > 
> > > > > > > - hw/i386/amd_iommu.c has similar leaks.  
> > > > > > 
> > > > > > AFAICT it's because there's no device-specific data cached in the
> > > > > > per-device IOMMU address space, at least so far.  IOW, all the data
> > > > > > structures allocated here can be re-used when a new device is plugged in
> > > > > > after the old device unplugged.
> > > > > > 
> > > > > > It's definitely not ideal since after unplug (and before a new device
> > > > > > plugged in) the resource is not needed at all so it's kind of wasted, but
> > > > > > it should work functionally.  If to achieve that, some iommu_unplug() or
> > > > > > iommu_cleanup() hook sounds reasonable.  
> > > > > 
> > > > > I guess the question is whether PCI busses can be hotplugged with
> > > > > IOMMUs. If yes, then there is a memory leak that matters for
> > > > > intel_iommu.c and amd_iommu.c.  
> > > > 
> > > > It can't, and we only support one vIOMMU so far for both (commit
> > > > 1b3bf13890fd849b26).  Thanks,  
> > > 
> > > I see, thanks!
> > > 
> > > Okay, summarizing options for the vfio-user IOMMU:
> > > 
> > > 1. Use the same singleton approach as existing IOMMUs where the
> > >    MemoryRegion/AddressSpace are never freed. Don't bother deleting.
> > > 
> > > 2. Keep the approach in this patch where vfio-user code manually calls a
> > >    custom delete function (not part of the pci_setup_iommu() API). This
> > >    is slightly awkward to do without global state and that's what
> > >    started this discussion.
> > > 
> > > 3. Introduce an optional pci_setup_iommu() callback:
> > > 
> > >    typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, void *opaque, int devfn);
> > > 
> > >    Solves the awkwardness of option #2. Not needed by existing IOMMU
> > >    devices.  
> > 
> > Looks all workable to me.  One tiny thing is if we'd like 3) we may want to
> > pass over the PCIDevice* too because in this case IIUC we'd need to double
> > check the device class before doing anything - we may not want to call the
> > vfio-user callbacks for general emulated devices under the same pci bus.
> > 
> > I think we could also fetch that from PCIBus.devices[devfn] but that's just
> > not as obvious.
> > 
> > Option 4) is as mentioned previously, that we add another device unplug
> > hook that can be registered per-device.  I just didn't think thoroughly on
> can you expand on why per device hook is needed?

E.g. when the pci bus that contains the vfio-user device also contains
another emulated device?  Then IIUC we only want to call the vfio-user hook
for the vfio-user device, not the other devices on the same bus?

Per-bus will work too, but then the per-bus hook will need to first
identify the PCIDevice* object, so it'll work similarly to a per-device hook.

> 
> > how it would interact with the current HotplugHandler design yet.. it looks
> > quite similar but so far it's either for the machine type or pci bus, not
> > capable of registering on one single device (and it's always a mystery to
> > me why we'd rather ignore the per-bus hook if the machine hook
> > existed.. that's in qdev_get_hotplug_handler).
> 
> machine hook is there for bus-less devices mainly, if it's not defined
> code will fallback to bus handler if any exists.
> 
> However machine hook can also be used to override default hotplug chain
> to do to implement non trivial plug/unplug flow.
> for example see pc_get_hotplug_handler(), corresponding
> pc_machine_device_[pre_plug|plug|unplug...]_cb() where
> plug/unplug chain is altered for some PCI devices types.
> Perhaps the same can be done for vfio.

It just seems non-obvious, no?  For example, if someone implements a pci
bus with hotplug_handler() being provided, it will surprise me a bit if
it's triggered conditionally, depending on which machine type the bus is
attached to, and whether this machine type has get_hotplug_handler().

Thanks,
Jag Raman April 13, 2022, 9:59 p.m. UTC | #15
> On Apr 13, 2022, at 2:24 PM, Jag Raman <jag.raman@oracle.com> wrote:
> 
> 
> 
>> On Apr 13, 2022, at 10:25 AM, Igor Mammedov <imammedo@redhat.com> wrote:
>> 
>> On Fri, 25 Mar 2022 15:19:41 -0400
>> Jagannathan Raman <jag.raman@oracle.com> wrote:
>> 
>>> Assign separate address space for each device in the remote processes.
>>> 
>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>>> ---
>>> include/hw/remote/iommu.h | 18 ++++++++
>>> hw/remote/iommu.c         | 95 +++++++++++++++++++++++++++++++++++++++
>>> MAINTAINERS               |  2 +
>>> hw/remote/meson.build     |  1 +
>>> 4 files changed, 116 insertions(+)
>>> create mode 100644 include/hw/remote/iommu.h
>>> create mode 100644 hw/remote/iommu.c
>>> 
>>> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
>>> new file mode 100644
>>> index 0000000000..8f850400f1
>>> --- /dev/null
>>> +++ b/include/hw/remote/iommu.h
>>> @@ -0,0 +1,18 @@
>>> +/**
>>> + * Copyright © 2022 Oracle and/or its affiliates.
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>> + * See the COPYING file in the top-level directory.
>>> + *
>>> + */
>>> +
>>> +#ifndef REMOTE_IOMMU_H
>>> +#define REMOTE_IOMMU_H
>>> +
>>> +#include "hw/pci/pci_bus.h"
>>> +
>>> +void remote_configure_iommu(PCIBus *pci_bus);
>>> +
>>> +void remote_iommu_del_device(PCIDevice *pci_dev);
>>> +
>>> +#endif
>>> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
>>> new file mode 100644
>>> index 0000000000..13f329b45d
>>> --- /dev/null
>>> +++ b/hw/remote/iommu.c
>>> @@ -0,0 +1,95 @@
>>> +/**
>>> + * IOMMU for remote device
>>> + *
>>> + * Copyright © 2022 Oracle and/or its affiliates.
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>> + * See the COPYING file in the top-level directory.
>>> + *
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qemu-common.h"
>>> +
>>> +#include "hw/remote/iommu.h"
>>> +#include "hw/pci/pci_bus.h"
>>> +#include "hw/pci/pci.h"
>>> +#include "exec/memory.h"
>>> +#include "exec/address-spaces.h"
>>> +#include "trace.h"
>>> +
>>> +struct RemoteIommuElem {
>>> +    AddressSpace  as;
>>> +    MemoryRegion  mr;
>>> +};
>>> +
>>> +struct RemoteIommuTable {
>>> +    QemuMutex lock;
>>> +    GHashTable *elem_by_bdf;
>>> +} remote_iommu_table;
>>> +
>>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
>>> +
>>> +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
>>> +                                              void *opaque, int devfn)
>>> +{
>>> +    struct RemoteIommuTable *iommu_table = opaque;
>>> +    struct RemoteIommuElem *elem = NULL;
>>> +    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
>>> +
>>> +    elem = g_hash_table_lookup(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));
>>> +
>>> +    if (!elem) {
>>> +        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
>>> +        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
>>> +
>>> +        elem = g_malloc0(sizeof(struct RemoteIommuElem));
>>> +
>>> +        memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
>> goes here:
>>  memory_region_do_init()
>>       if (!owner) {                                                            
>>           owner = container_get(qdev_get_machine(), "/unattached");            
>>       }  
>> 
>> then
>> 
>>> +        address_space_init(&elem->as, &elem->mr, as_name);
>>> +
>>> +        qemu_mutex_lock(&iommu_table->lock);
>>> +        g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), elem);
>>> +        qemu_mutex_unlock(&iommu_table->lock);
>>> +    }
>>> +
>>> +    return &elem->as;
>>> +}
>>> +
>>> +static void remote_iommu_del_elem(gpointer data)
>>> +{
>>> +    struct RemoteIommuElem *elem = data;
>>> +
>>> +    g_assert(elem);
>>> +
>>> +    memory_region_unref(&elem->mr);
>> 
>> here we call
>>     object_unref(mr->owner); 
>> leaving dangling pointer in owner '(qdev_get_machine(), "/unattached")'
>> it doesn't look correct
>> 
>> I thought that memory_region_unref() should be always paired with memory_region_ref()
>> 
>> and looking at memory_region_init(...owner...) history it looks like
>> owner-less (NULL) regions are not meant to be deleted ever.
> 
> Hi Igor,
> 
> Thanks for the pointers about ref counters for MemoryRegions.
> 
> It makes sense - MemoryRegions are not QEMU Objects. So their
> owner’s ref counters are used instead. So the expectation is that
> when the owner is destroyed, the MemoryRegions initialized by them
> also get destroyed simultaneously.

Well, MemoryRegions are indeed QEMU objects -
"memory_region_init() -> object_initialize()" initializes the object.
So we should be able to unref the MemoryRegion object directly.

We could make the PCIDevice the owner of its IOMMU region -
when the device is finalized, its region would be finalized as well.

Given the above, I don’t think we would need a separate delete
function (such as remote_iommu_del_device()). When the device is
unplugged, its MemoryRegion would be finalized automatically.
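
Roughly, the only change needed in remote_iommu_find_add_as() would be the
owner argument (assuming the device is already visible in
pci_bus->devices[devfn] by the time the hook runs):

    /* inside the !elem branch of remote_iommu_find_add_as() */
    PCIDevice *pci_dev = pci_bus->devices[devfn];

    /* the PCIDevice owns the region, so it is finalized with the device */
    memory_region_init(&elem->mr, OBJECT(pci_dev), mr_name, UINT64_MAX);
    address_space_init(&elem->as, &elem->mr, as_name);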

Thank you!
--
Jag

> 
> In this case, RemoteIommuElem->mr does not have an owner. Therefore,
> we don’t have to manage its ref counter. When RemoteIommuElem is
> destroyed, the MemoryRegion should be cleaned up automatically.
> 
> --
> Jag
> 
>> 
>>> +    address_space_destroy(&elem->as);
>>> +
>>> +    g_free(elem);
>>> +}
>>> +
>>> +void remote_iommu_del_device(PCIDevice *pci_dev)
>>> +{
>>> +    int pci_bdf;
>>> +
>>> +    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
>>> +        return;
>>> +    }
>>> +
>>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
>>> +
>>> +    qemu_mutex_lock(&remote_iommu_table.lock);
>>> +    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
>>> +    qemu_mutex_unlock(&remote_iommu_table.lock);
>>> +}
>>> +
>>> +void remote_configure_iommu(PCIBus *pci_bus)
>>> +{
>>> +    if (!remote_iommu_table.elem_by_bdf) {
>>> +        remote_iommu_table.elem_by_bdf =
>>> +            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
>>> +        qemu_mutex_init(&remote_iommu_table.lock);
>>> +    }
>>> +
>>> +    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
>>> +}
>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>> index e7b0297a63..21694a9698 100644
>>> --- a/MAINTAINERS
>>> +++ b/MAINTAINERS
>>> @@ -3599,6 +3599,8 @@ F: hw/remote/iohub.c
>>> F: include/hw/remote/iohub.h
>>> F: subprojects/libvfio-user
>>> F: hw/remote/vfio-user-obj.c
>>> +F: hw/remote/iommu.c
>>> +F: include/hw/remote/iommu.h
>>> 
>>> EBPF:
>>> M: Jason Wang <jasowang@redhat.com>
>>> diff --git a/hw/remote/meson.build b/hw/remote/meson.build
>>> index 534ac5df79..bcef83c8cc 100644
>>> --- a/hw/remote/meson.build
>>> +++ b/hw/remote/meson.build
>>> @@ -6,6 +6,7 @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
>>> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
>>> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
>>> remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
>>> +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c'))
>>> remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c'))
>>> 
>>> remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: vfiouser)
Igor Mammedov April 19, 2022, 8:48 a.m. UTC | #16
On Wed, 13 Apr 2022 15:08:33 -0400
Peter Xu <peterx@redhat.com> wrote:

> On Wed, Apr 13, 2022 at 04:37:35PM +0200, Igor Mammedov wrote:
> > On Thu, 31 Mar 2022 08:41:01 -0400
> > Peter Xu <peterx@redhat.com> wrote:
> >   
> > > On Thu, Mar 31, 2022 at 10:47:33AM +0100, Stefan Hajnoczi wrote:  
> > > > On Wed, Mar 30, 2022 at 01:13:03PM -0400, Peter Xu wrote:    
> > > > > On Wed, Mar 30, 2022 at 05:08:24PM +0100, Stefan Hajnoczi wrote:    
> > > > > > On Wed, Mar 30, 2022 at 08:53:16AM -0400, Peter Xu wrote:    
> > > > > > > On Wed, Mar 30, 2022 at 11:04:24AM +0100, Stefan Hajnoczi wrote:    
> > > > > > > > This makes me wonder whether there is a deeper issue with the
> > > > > > > > pci_setup_iommu() API: the lack of per-device cleanup callbacks.
> > > > > > > > Per-device IOMMU resources should be freed when a device is hot
> > > > > > > > unplugged.
> > > > > > > > 
> > > > > > > > From what I can tell this is not the case today:
> > > > > > > > 
> > > > > > > > - hw/i386/intel_iommu.c:vtd_find_add_as() allocates and adds device
> > > > > > > >   address spaces but I can't find where they are removed and freed.
> > > > > > > >   VTDAddressSpace instances pointed to from vtd_bus->dev_as[] are leaked.
> > > > > > > > 
> > > > > > > > - hw/i386/amd_iommu.c has similar leaks.    
> > > > > > > 
> > > > > > > AFAICT it's because there's no device-specific data cached in the
> > > > > > > per-device IOMMU address space, at least so far.  IOW, all the data
> > > > > > > structures allocated here can be re-used when a new device is plugged in
> > > > > > > after the old device unplugged.
> > > > > > > 
> > > > > > > It's definitely not ideal since after unplug (and before a new device
> > > > > > > plugged in) the resource is not needed at all so it's kind of wasted, but
> > > > > > > it should work functionally.  If to achieve that, some iommu_unplug() or
> > > > > > > iommu_cleanup() hook sounds reasonable.    
> > > > > > 
> > > > > > I guess the question is whether PCI busses can be hotplugged with
> > > > > > IOMMUs. If yes, then there is a memory leak that matters for
> > > > > > intel_iommu.c and amd_iommu.c.    
> > > > > 
> > > > > It can't, and we only support one vIOMMU so far for both (commit
> > > > > 1b3bf13890fd849b26).  Thanks,    
> > > > 
> > > > I see, thanks!
> > > > 
> > > > Okay, summarizing options for the vfio-user IOMMU:
> > > > 
> > > > 1. Use the same singleton approach as existing IOMMUs where the
> > > >    MemoryRegion/AddressSpace are never freed. Don't bother deleting.
> > > > 
> > > > 2. Keep the approach in this patch where vfio-user code manually calls a
> > > >    custom delete function (not part of the pci_setup_iommu() API). This
> > > >    is slightly awkward to do without global state and that's what
> > > >    started this discussion.
> > > > 
> > > > 3. Introduce an optional pci_setup_iommu() callback:
> > > > 
> > > >    typedef void (*PCIIOMMUDeviceUnplug)(PCIBus *bus, void *opaque, int devfn);
> > > > 
> > > >    Solves the awkwardness of option #2. Not needed by existing IOMMU
> > > >    devices.    
> > > 
> > > Looks all workable to me.  One tiny thing is if we'd like 3) we may want to
> > > pass over the PCIDevice* too because in this case IIUC we'd need to double
> > > check the device class before doing anything - we may not want to call the
> > > vfio-user callbacks for general emulated devices under the same pci bus.
> > > 
> > > I think we could also fetch that from PCIBus.devices[devfn] but that's just
> > > not as obvious.
> > > 
> > > Option 4) is as mentioned previously, that we add another device unplug
> > > hook that can be registered per-device.  I just didn't think thoroughly on  
> > can you expand on why per device hook is needed?  
> 
> E.g. when the pci bus that contains the vfio-user device also contains
> another emulated device?  Then IIUC we only want to call the vfio-user hook
> for the vfio-user device, not the rest ones on the same bus?
> 
> Per-bus will work too, but again then the per-bus hook will need to first
> identify the PCIDevice* object so it'll work similarly as a per-device hook.

The question is whether a hook is really needed at all,
and why doing the cleanup in vfio-user's unrealize() is not sufficient.

Also, there are realize/unrealize listener hooks that can attach arbitrary
code to the plug/unplug workflow as a last resort (i.e. without properly
dividing responsibility between emulated device models). On top of that,
the listener interface has no error handling, so such hooks must never
fail.
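
For illustration only, such a last-resort listener could look roughly like
this (the names are made up; I'm assuming the existing DeviceListener
realize/unrealize callbacks):

static void remote_iommu_unrealize_cb(DeviceListener *listener,
                                      DeviceState *dev)
{
    /* Only PCI devices are tracked in the remote IOMMU table. */
    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        remote_iommu_del_device(PCI_DEVICE(dev));
    }
}

static DeviceListener remote_iommu_listener = {
    .unrealize = remote_iommu_unrealize_cb,
};

Registration would be a single device_listener_register(&remote_iommu_listener)
call, e.g. from remote_configure_iommu(); as said, there is no error handling
in this path, so the callback must not fail.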

(This is also a generic vfio issue: it is written as a mix of backend and
frontend code, which is confusing at times and makes it hard to tell what
belongs where, unless one has intimate knowledge of how it is supposed to
work.)

> > > how it would interact with the current HotplugHandler design yet.. it looks
> > > quite similar but so far it's either for the machine type or pci bus, not
> > > capable of registering on one single device (and it's always a mystery to
> > > me why we'd rather ignore the per-bus hook if the machine hook
> > > existed.. that's in qdev_get_hotplug_handler).  
> > 
> > The machine hook is mainly there for bus-less devices; if it's not defined,
> > the code will fall back to the bus handler, if one exists.
> > 
> > However, the machine hook can also be used to override the default hotplug
> > chain to implement a non-trivial plug/unplug flow. For example, see
> > pc_get_hotplug_handler() and the corresponding
> > pc_machine_device_[pre_plug|plug|unplug...]_cb(), where the plug/unplug
> > chain is altered for some PCI device types.
> > Perhaps the same can be done for vfio.  
> 
> It just seems non-obvious, no?  For example, if someone implements a pci
> bus with hotplug_handler() being provided, it will surprise me a bit if
> it's triggered conditionally, depending on which machine type the bus is
> attached to, and whether this machine type has get_hotplug_handler().

The bus-level handler is called anyway; the difference is that the plug/unplug
flow can additionally be redirected to other relevant components.

It might be confusing at first. The idea behind having a single entry point
to override the flow at the machine level is that the machine is the topmost
object: it governs all other devices, wires them together, and is allowed to
reshape the default wiring between attached devices without violating the
relationships between components. (The drawback is that this approach adds
quite a bit of boilerplate, but no one has suggested or implemented any other
idea for generic device wiring.)
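
Purely as an illustration of that pattern (modelled on
pc_get_hotplug_handler(); nothing like this exists for the remote machine
today):

static HotplugHandler *remote_machine_get_hotplug_handler(MachineState *machine,
                                                          DeviceState *dev)
{
    /* Route plug/unplug for PCI devices through the machine so extra steps
     * (e.g. IOMMU table cleanup) can be chained in; returning NULL keeps
     * the default bus-level handler. */
    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        return HOTPLUG_HANDLER(machine);
    }

    return NULL;
}

The machine's class_init would point mc->get_hotplug_handler at this, and the
machine's unplug callback would do the device-specific cleanup before handing
off to the normal PCI bus handler.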

Adding specialized hooks to generic bus code for one device's quirks might
work too, but it's not obvious either: such hooks end up scattered through
the codebase, often polluting generic code with device-specific workarounds,
and it's harder to review/judge whether a proposed hook is not just another
layer violation.
Well, I don't know enough about vfio/iommu to make a useful suggestion, but
adding vfio-induced hooks to the generic PCI bus/device code does look
suspicious to me.


> Thanks,
>
diff mbox series

Patch

diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
new file mode 100644
index 0000000000..8f850400f1
--- /dev/null
+++ b/include/hw/remote/iommu.h
@@ -0,0 +1,18 @@ 
+/**
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_IOMMU_H
+#define REMOTE_IOMMU_H
+
+#include "hw/pci/pci_bus.h"
+
+void remote_configure_iommu(PCIBus *pci_bus);
+
+void remote_iommu_del_device(PCIDevice *pci_dev);
+
+#endif
diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
new file mode 100644
index 0000000000..13f329b45d
--- /dev/null
+++ b/hw/remote/iommu.c
@@ -0,0 +1,95 @@ 
+/**
+ * IOMMU for remote device
+ *
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
+
+struct RemoteIommuElem {
+    AddressSpace  as;
+    MemoryRegion  mr;
+};
+
+struct RemoteIommuTable {
+    QemuMutex lock;
+    GHashTable *elem_by_bdf;
+} remote_iommu_table;
+
+#define INT2VOIDP(i) (void *)(uintptr_t)(i)
+
+static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
+                                              void *opaque, int devfn)
+{
+    struct RemoteIommuTable *iommu_table = opaque;
+    struct RemoteIommuElem *elem = NULL;
+    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), devfn);
+
+    elem = g_hash_table_lookup(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf));
+
+    if (!elem) {
+        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", pci_bdf);
+        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", pci_bdf);
+
+        elem = g_malloc0(sizeof(struct RemoteIommuElem));
+
+        memory_region_init(&elem->mr, NULL, mr_name, UINT64_MAX);
+        address_space_init(&elem->as, &elem->mr, as_name);
+
+        qemu_mutex_lock(&iommu_table->lock);
+        g_hash_table_insert(iommu_table->elem_by_bdf, INT2VOIDP(pci_bdf), elem);
+        qemu_mutex_unlock(&iommu_table->lock);
+    }
+
+    return &elem->as;
+}
+
+static void remote_iommu_del_elem(gpointer data)
+{
+    struct RemoteIommuElem *elem = data;
+
+    g_assert(elem);
+
+    memory_region_unref(&elem->mr);
+    address_space_destroy(&elem->as);
+
+    g_free(elem);
+}
+
+void remote_iommu_del_device(PCIDevice *pci_dev)
+{
+    int pci_bdf;
+
+    if (!remote_iommu_table.elem_by_bdf || !pci_dev) {
+        return;
+    }
+
+    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
+
+    qemu_mutex_lock(&remote_iommu_table.lock);
+    g_hash_table_remove(remote_iommu_table.elem_by_bdf, INT2VOIDP(pci_bdf));
+    qemu_mutex_unlock(&remote_iommu_table.lock);
+}
+
+void remote_configure_iommu(PCIBus *pci_bus)
+{
+    if (!remote_iommu_table.elem_by_bdf) {
+        remote_iommu_table.elem_by_bdf =
+            g_hash_table_new_full(NULL, NULL, NULL, remote_iommu_del_elem);
+        qemu_mutex_init(&remote_iommu_table.lock);
+    }
+
+    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, &remote_iommu_table);
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index e7b0297a63..21694a9698 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3599,6 +3599,8 @@  F: hw/remote/iohub.c
 F: include/hw/remote/iohub.h
 F: subprojects/libvfio-user
 F: hw/remote/vfio-user-obj.c
+F: hw/remote/iommu.c
+F: include/hw/remote/iommu.h
 
 EBPF:
 M: Jason Wang <jasowang@redhat.com>
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index 534ac5df79..bcef83c8cc 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -6,6 +6,7 @@  remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c'))
 remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c'))
 
 remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: vfiouser)