diff mbox

[qemu,v4,2/3] vfio/spapr: Add a notifier for PPC64 HV/PR KVM about new group attached to LIOBN

Message ID 20170720072231.35054-3-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy July 20, 2017, 7:22 a.m. UTC
This implements a notification for a new IOMMU group attached to
sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.

This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd()
callback which returns KVM fd associated with LIOBN, the notifier uses it
to establish link between LIOBN and IOMMU group in the KVM.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

The practical reason for adding get_fd() as a callback is avoiding static
linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when
CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest
of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides)
compiles only when CONFIG_PSERIES=y.
---
 include/hw/ppc/spapr.h        | 15 +++++++++++++++
 include/hw/vfio/vfio-common.h |  2 ++
 hw/ppc/spapr_iommu.c          | 10 ++++++++++
 hw/vfio/common.c              | 10 ++++++++++
 hw/vfio/spapr.c               | 39 +++++++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |  1 +
 6 files changed, 77 insertions(+)

Comments

David Gibson Aug. 25, 2017, 6:21 a.m. UTC | #1
On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote:
> This implements a notification for a new IOMMU group attached to
> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
> 
> This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd()
> callback which returns KVM fd associated with LIOBN, the notifier uses it
> to establish link between LIOBN and IOMMU group in the KVM.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> 
> The practical reason for adding get_fd() as a callback is avoiding static
> linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when
> CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest
> of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides)
> compiles only when CONFIG_PSERIES=y.

Ok.  Nonetheless I don't think the get_fd() method is a good idea.
First, it's basically an abstraction violation, exposing the region's
internal fd.  Second, it's a method which only plausibly has one
implementation which is rarely sensible.

What this comes down to is that the guest IOMMU mechanism needs
information about host vfio groups mapped - for an optimization in
this case.

So what would make sense to me is to put an "add_vfio_group" method
into IOMMUMemoryRegionClass (or even MemoryRegionClass).  In most
cases that will be NULL (== no-op).  For the spapr IOMMU region, it
will (attempt to) connect the host group to the guest liobn.

> ---
>  include/hw/ppc/spapr.h        | 15 +++++++++++++++
>  include/hw/vfio/vfio-common.h |  2 ++
>  hw/ppc/spapr_iommu.c          | 10 ++++++++++
>  hw/vfio/common.c              | 10 ++++++++++
>  hw/vfio/spapr.c               | 39 +++++++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events          |  1 +
>  6 files changed, 77 insertions(+)
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 2a303a705c..c1d37e6356 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr);
>  #define RTAS_EVENT_SCAN_RATE    1
>  
>  typedef struct sPAPRTCETable sPAPRTCETable;
> +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass;
>  
>  #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
>  #define SPAPR_TCE_TABLE(obj) \
> @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable;
>  #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region"
>  #define SPAPR_IOMMU_MEMORY_REGION(obj) \
>          OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION)
> +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \
> +        OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \
> +                         TYPE_SPAPR_IOMMU_MEMORY_REGION)
> +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \
> +        OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \
> +                           TYPE_SPAPR_IOMMU_MEMORY_REGION)
>  
>  struct sPAPRTCETable {
>      DeviceState parent;
> @@ -618,6 +625,14 @@ struct sPAPRTCETable {
>      QLIST_ENTRY(sPAPRTCETable) list;
>  };
>  
> +struct sPAPRIOMMUMemoryRegionClass {
> +    /* private */
> +    IOMMUMemoryRegionClass parent_class;
> +
> +    /* public */
> +    int (*get_fd)(IOMMUMemoryRegion *iommu_mr);
> +};
> +
>  sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn);

To make sure I'm understanding correctly: the MR subclass here is
representing a guest-side property, yes?  It means that on the guest
side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE
interface.

>  struct sPAPREventLogEntry {
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f3a2ac9fee..d245d3cecc 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener;
>  int vfio_spapr_create_window(VFIOContainer *container,
>                               MemoryRegionSection *section,
>                               hwaddr *pgsize);
> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
> +                          IOMMUMemoryRegion *iommumr);
>  int vfio_spapr_remove_window(VFIOContainer *container,
>                               hwaddr offset_within_address_space);
>  
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 307dc3021e..82fca61a75 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>      }
>  }
>  
> +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr)
> +{
> +    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
> +
> +    return tcet->fd;

Does this have a well defined value if there's no KVM?

> +}
> +
>  static int spapr_tce_table_post_load(void *opaque, int version_id)
>  {
>      sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
> @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = {
>  static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
>  {
>      IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> +    sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass);
>  
>      imrc->translate = spapr_tce_translate_iommu;
>      imrc->get_min_page_size = spapr_tce_get_min_page_size;
>      imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
> +    simrc->get_fd = spapr_tce_get_fd;
>  }
>  
>  static const TypeInfo spapr_iommu_memory_region_info = {
>      .parent = TYPE_IOMMU_MEMORY_REGION,
>      .name = TYPE_SPAPR_IOMMU_MEMORY_REGION,
>      .class_init = spapr_iommu_memory_region_class_init,
> +    .class_size = sizeof(sPAPRIOMMUMemoryRegionClass),
>  };
>  
>  static void register_types(void)
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 7b2924c0ef..92f1f88ae8 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
>              goto fail;
>          }
>  
> +#ifdef CONFIG_KVM
> +        if (kvm_enabled()) {
> +            VFIOGroup *group;
> +
> +            QLIST_FOREACH(group, &container->group_list, container_next) {
> +                vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
> +                                      IOMMU_MEMORY_REGION(section->mr));
> +            }
> +        }

So, here you're informing the region of the groups when the region is
mapped in.  But don't you similarly need to notify if a group is added
to an existing address space?  And won't you also need notifications
of groups/regions being removed?

> +#endif
>          vfio_host_win_add(container, section->offset_within_address_space,
>                            section->offset_within_address_space +
>                            int128_get64(section->size) - 1, pgsize);
> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
> index 32fd6a9b54..2b9af75c03 100644
> --- a/hw/vfio/spapr.c
> +++ b/hw/vfio/spapr.c
> @@ -15,8 +15,12 @@
>  
>  #include "hw/vfio/vfio-common.h"
>  #include "hw/hw.h"
> +#include "hw/ppc/spapr.h"
>  #include "qemu/error-report.h"
>  #include "trace.h"
> +#ifdef CONFIG_KVM
> +#include "linux/kvm.h"
> +#endif
>  
>  static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
>  {
> @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container,
>      return 0;
>  }
>  
> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
> +                          IOMMUMemoryRegion *iommu_mr)
> +{
> +#ifdef CONFIG_KVM
> +    struct kvm_vfio_spapr_tce param = {
> +        .groupfd = groupfd,
> +    };
> +    struct kvm_device_attr attr = {
> +        .group = KVM_DEV_VFIO_GROUP,
> +        .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
> +        .addr = (uint64_t)(unsigned long)&param,
> +    };
> +    IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr);

This will assert if you have a non-spapr guest IOMMU on a ppc host
(e.g. emulating an x86 with VT-d under TCG).

> +    sPAPRIOMMUMemoryRegionClass *simrc =
> +            SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr);
> +
> +    if (!simrc->get_fd) {
> +        error_report("vfio: No get_fd defined for IOMMU MR");
> +        return -EFAULT;
> +    }
> +
> +    param.tablefd = simrc->get_fd(spapr_iommu_mr);
> +
> +    if (param.tablefd != -1) {
> +        if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> +            error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
> +                         param.tablefd, param.groupfd, strerror(errno));
> +            return -errno;
> +        }
> +    }
> +    trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
> +#endif
> +    return 0;
> +}
> +
>  int vfio_spapr_remove_window(VFIOContainer *container,
>                               hwaddr offset_within_address_space)
>  {
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 2561c6d31a..084a92f7c2 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
>  vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>  vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
>  vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
> +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
Alexey Kardashevskiy Aug. 28, 2017, 4:20 a.m. UTC | #2
On 25/08/17 16:21, David Gibson wrote:
> On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote:
>> This implements a notification for a new IOMMU group attached to
>> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
>>
>> This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd()
>> callback which returns KVM fd associated with LIOBN, the notifier uses it
>> to establish link between LIOBN and IOMMU group in the KVM.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>
>> The practical reason for adding get_fd() as a callback is avoiding static
>> linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when
>> CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest
>> of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides)
>> compiles only when CONFIG_PSERIES=y.
> 
> Ok.  Nonetheless I don't think the get_fd() method is a good idea.
> First, it's basically an abstraction violation, exposing the region's
> internal fd.  Second, it's a method which only plausibly has one
> implementation which is rarely sensible.
> 
> What this comes down to is that the guest IOMMU mechanism needs
> information about host vfio groups mapped - for an optimization in
> this case.
> 
> So what would make sense to me is to put an "add_vfio_group" method
> into IOMMUMemoryRegionClass (or even MemoryRegionClass).  In most
> cases that will be NULL (== no-op).  For the spapr IOMMU region, it
> will (attempt to) connect the host group to the guest liobn.


Like this?

IOMMUMemoryRegionClass::add_vfio_group_kvm(int kvm_device_fd, int groupfd);

It is just cleaner to keep kvm_device_fd and its ioclts() in the same place
but ok.


> 
>> ---
>>  include/hw/ppc/spapr.h        | 15 +++++++++++++++
>>  include/hw/vfio/vfio-common.h |  2 ++
>>  hw/ppc/spapr_iommu.c          | 10 ++++++++++
>>  hw/vfio/common.c              | 10 ++++++++++
>>  hw/vfio/spapr.c               | 39 +++++++++++++++++++++++++++++++++++++++
>>  hw/vfio/trace-events          |  1 +
>>  6 files changed, 77 insertions(+)
>>
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 2a303a705c..c1d37e6356 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr);
>>  #define RTAS_EVENT_SCAN_RATE    1
>>  
>>  typedef struct sPAPRTCETable sPAPRTCETable;
>> +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass;
>>  
>>  #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
>>  #define SPAPR_TCE_TABLE(obj) \
>> @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable;
>>  #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region"
>>  #define SPAPR_IOMMU_MEMORY_REGION(obj) \
>>          OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION)
>> +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \
>> +        OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \
>> +                         TYPE_SPAPR_IOMMU_MEMORY_REGION)
>> +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \
>> +        OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \
>> +                           TYPE_SPAPR_IOMMU_MEMORY_REGION)
>>  
>>  struct sPAPRTCETable {
>>      DeviceState parent;
>> @@ -618,6 +625,14 @@ struct sPAPRTCETable {
>>      QLIST_ENTRY(sPAPRTCETable) list;
>>  };
>>  
>> +struct sPAPRIOMMUMemoryRegionClass {
>> +    /* private */
>> +    IOMMUMemoryRegionClass parent_class;
>> +
>> +    /* public */
>> +    int (*get_fd)(IOMMUMemoryRegion *iommu_mr);
>> +};
>> +
>>  sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn);
> 
> To make sure I'm understanding correctly: the MR subclass here is
> representing a guest-side property, yes?  It means that on the guest
> side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE
> interface.

I do not understand the question, sorry. MR is a QEMU thing, only QEMU
devices use these MRs as MRs.


>>  struct sPAPREventLogEntry {
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index f3a2ac9fee..d245d3cecc 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener;
>>  int vfio_spapr_create_window(VFIOContainer *container,
>>                               MemoryRegionSection *section,
>>                               hwaddr *pgsize);
>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>> +                          IOMMUMemoryRegion *iommumr);
>>  int vfio_spapr_remove_window(VFIOContainer *container,
>>                               hwaddr offset_within_address_space);
>>  
>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>> index 307dc3021e..82fca61a75 100644
>> --- a/hw/ppc/spapr_iommu.c
>> +++ b/hw/ppc/spapr_iommu.c
>> @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>>      }
>>  }
>>  
>> +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr)
>> +{
>> +    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
>> +
>> +    return tcet->fd;
> 
> Does this have a well defined value if there's no KVM?


It is -1 in TCG, any other value is undefined.


> 
>> +}
>> +
>>  static int spapr_tce_table_post_load(void *opaque, int version_id)
>>  {
>>      sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
>> @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = {
>>  static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
>>  {
>>      IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>> +    sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass);
>>  
>>      imrc->translate = spapr_tce_translate_iommu;
>>      imrc->get_min_page_size = spapr_tce_get_min_page_size;
>>      imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
>> +    simrc->get_fd = spapr_tce_get_fd;
>>  }
>>  
>>  static const TypeInfo spapr_iommu_memory_region_info = {
>>      .parent = TYPE_IOMMU_MEMORY_REGION,
>>      .name = TYPE_SPAPR_IOMMU_MEMORY_REGION,
>>      .class_init = spapr_iommu_memory_region_class_init,
>> +    .class_size = sizeof(sPAPRIOMMUMemoryRegionClass),
>>  };
>>  
>>  static void register_types(void)
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 7b2924c0ef..92f1f88ae8 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>              goto fail;
>>          }
>>  
>> +#ifdef CONFIG_KVM
>> +        if (kvm_enabled()) {
>> +            VFIOGroup *group;
>> +
>> +            QLIST_FOREACH(group, &container->group_list, container_next) {
>> +                vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
>> +                                      IOMMU_MEMORY_REGION(section->mr));
>> +            }
>> +        }
> 
> So, here you're informing the region of the groups when the region is
> mapped in.  But don't you similarly need to notify if a group is added
> to an existing address space? 

It is either in VFIO for ages (forever? in vfio_connect_container()), or I
did not understand the question...


> And won't you also need notifications
> of groups/regions being removed?

Same comment. vfio_disconnect_container()?


> 
>> +#endif
>>          vfio_host_win_add(container, section->offset_within_address_space,
>>                            section->offset_within_address_space +
>>                            int128_get64(section->size) - 1, pgsize);
>> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
>> index 32fd6a9b54..2b9af75c03 100644
>> --- a/hw/vfio/spapr.c
>> +++ b/hw/vfio/spapr.c
>> @@ -15,8 +15,12 @@
>>  
>>  #include "hw/vfio/vfio-common.h"
>>  #include "hw/hw.h"
>> +#include "hw/ppc/spapr.h"
>>  #include "qemu/error-report.h"
>>  #include "trace.h"
>> +#ifdef CONFIG_KVM
>> +#include "linux/kvm.h"
>> +#endif
>>  
>>  static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
>>  {
>> @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container,
>>      return 0;
>>  }
>>  
>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>> +                          IOMMUMemoryRegion *iommu_mr)
>> +{
>> +#ifdef CONFIG_KVM
>> +    struct kvm_vfio_spapr_tce param = {
>> +        .groupfd = groupfd,
>> +    };
>> +    struct kvm_device_attr attr = {
>> +        .group = KVM_DEV_VFIO_GROUP,
>> +        .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
>> +        .addr = (uint64_t)(unsigned long)&param,
>> +    };
>> +    IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr);
> 
> This will assert if you have a non-spapr guest IOMMU on a ppc host
> (e.g. emulating an x86 with VT-d under TCG).


Sure but this should not execute when TCG, hence "_kvm" in
"vfio_spapr_notify_kvm".


> 
>> +    sPAPRIOMMUMemoryRegionClass *simrc =
>> +            SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr);
>> +
>> +    if (!simrc->get_fd) {
>> +        error_report("vfio: No get_fd defined for IOMMU MR");
>> +        return -EFAULT;
>> +    }
>> +
>> +    param.tablefd = simrc->get_fd(spapr_iommu_mr);
>> +
>> +    if (param.tablefd != -1) {
>> +        if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
>> +            error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
>> +                         param.tablefd, param.groupfd, strerror(errno));
>> +            return -errno;
>> +        }
>> +    }
>> +    trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
>> +#endif
>> +    return 0;
>> +}
>> +
>>  int vfio_spapr_remove_window(VFIOContainer *container,
>>                               hwaddr offset_within_address_space)
>>  {
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 2561c6d31a..084a92f7c2 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
>>  vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>>  vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
>>  vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
>> +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
>
Alexey Kardashevskiy Sept. 21, 2017, 9:21 a.m. UTC | #3
On 28/08/17 14:20, Alexey Kardashevskiy wrote:
> On 25/08/17 16:21, David Gibson wrote:
>> On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote:
>>> This implements a notification for a new IOMMU group attached to
>>> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
>>>
>>> This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd()
>>> callback which returns KVM fd associated with LIOBN, the notifier uses it
>>> to establish link between LIOBN and IOMMU group in the KVM.
>>>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Ping?


>>> ---
>>>
>>> The practical reason for adding get_fd() as a callback is avoiding static
>>> linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when
>>> CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest
>>> of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides)
>>> compiles only when CONFIG_PSERIES=y.
>>
>> Ok.  Nonetheless I don't think the get_fd() method is a good idea.
>> First, it's basically an abstraction violation, exposing the region's
>> internal fd.  Second, it's a method which only plausibly has one
>> implementation which is rarely sensible.
>>
>> What this comes down to is that the guest IOMMU mechanism needs
>> information about host vfio groups mapped - for an optimization in
>> this case.
>>
>> So what would make sense to me is to put an "add_vfio_group" method
>> into IOMMUMemoryRegionClass (or even MemoryRegionClass).  In most
>> cases that will be NULL (== no-op).  For the spapr IOMMU region, it
>> will (attempt to) connect the host group to the guest liobn.
> 
> 
> Like this?
> 
> IOMMUMemoryRegionClass::add_vfio_group_kvm(int kvm_device_fd, int groupfd);
> 
> It is just cleaner to keep kvm_device_fd and its ioclts() in the same place
> but ok.
> 
> 
>>
>>> ---
>>>  include/hw/ppc/spapr.h        | 15 +++++++++++++++
>>>  include/hw/vfio/vfio-common.h |  2 ++
>>>  hw/ppc/spapr_iommu.c          | 10 ++++++++++
>>>  hw/vfio/common.c              | 10 ++++++++++
>>>  hw/vfio/spapr.c               | 39 +++++++++++++++++++++++++++++++++++++++
>>>  hw/vfio/trace-events          |  1 +
>>>  6 files changed, 77 insertions(+)
>>>
>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>> index 2a303a705c..c1d37e6356 100644
>>> --- a/include/hw/ppc/spapr.h
>>> +++ b/include/hw/ppc/spapr.h
>>> @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr);
>>>  #define RTAS_EVENT_SCAN_RATE    1
>>>  
>>>  typedef struct sPAPRTCETable sPAPRTCETable;
>>> +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass;
>>>  
>>>  #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
>>>  #define SPAPR_TCE_TABLE(obj) \
>>> @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable;
>>>  #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region"
>>>  #define SPAPR_IOMMU_MEMORY_REGION(obj) \
>>>          OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION)
>>> +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \
>>> +        OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \
>>> +                         TYPE_SPAPR_IOMMU_MEMORY_REGION)
>>> +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \
>>> +        OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \
>>> +                           TYPE_SPAPR_IOMMU_MEMORY_REGION)
>>>  
>>>  struct sPAPRTCETable {
>>>      DeviceState parent;
>>> @@ -618,6 +625,14 @@ struct sPAPRTCETable {
>>>      QLIST_ENTRY(sPAPRTCETable) list;
>>>  };
>>>  
>>> +struct sPAPRIOMMUMemoryRegionClass {
>>> +    /* private */
>>> +    IOMMUMemoryRegionClass parent_class;
>>> +
>>> +    /* public */
>>> +    int (*get_fd)(IOMMUMemoryRegion *iommu_mr);
>>> +};
>>> +
>>>  sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn);
>>
>> To make sure I'm understanding correctly: the MR subclass here is
>> representing a guest-side property, yes?  It means that on the guest
>> side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE
>> interface.
> 
> I do not understand the question, sorry. MR is a QEMU thing, only QEMU
> devices use these MRs as MRs.
> 
> 
>>>  struct sPAPREventLogEntry {
>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>> index f3a2ac9fee..d245d3cecc 100644
>>> --- a/include/hw/vfio/vfio-common.h
>>> +++ b/include/hw/vfio/vfio-common.h
>>> @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener;
>>>  int vfio_spapr_create_window(VFIOContainer *container,
>>>                               MemoryRegionSection *section,
>>>                               hwaddr *pgsize);
>>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>>> +                          IOMMUMemoryRegion *iommumr);
>>>  int vfio_spapr_remove_window(VFIOContainer *container,
>>>                               hwaddr offset_within_address_space);
>>>  
>>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>>> index 307dc3021e..82fca61a75 100644
>>> --- a/hw/ppc/spapr_iommu.c
>>> +++ b/hw/ppc/spapr_iommu.c
>>> @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>>>      }
>>>  }
>>>  
>>> +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr)
>>> +{
>>> +    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
>>> +
>>> +    return tcet->fd;
>>
>> Does this have a well defined value if there's no KVM?
> 
> 
> It is -1 in TCG, any other value is undefined.
> 
> 
>>
>>> +}
>>> +
>>>  static int spapr_tce_table_post_load(void *opaque, int version_id)
>>>  {
>>>      sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
>>> @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = {
>>>  static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
>>>  {
>>>      IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>>> +    sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass);
>>>  
>>>      imrc->translate = spapr_tce_translate_iommu;
>>>      imrc->get_min_page_size = spapr_tce_get_min_page_size;
>>>      imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
>>> +    simrc->get_fd = spapr_tce_get_fd;
>>>  }
>>>  
>>>  static const TypeInfo spapr_iommu_memory_region_info = {
>>>      .parent = TYPE_IOMMU_MEMORY_REGION,
>>>      .name = TYPE_SPAPR_IOMMU_MEMORY_REGION,
>>>      .class_init = spapr_iommu_memory_region_class_init,
>>> +    .class_size = sizeof(sPAPRIOMMUMemoryRegionClass),
>>>  };
>>>  
>>>  static void register_types(void)
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index 7b2924c0ef..92f1f88ae8 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>>              goto fail;
>>>          }
>>>  
>>> +#ifdef CONFIG_KVM
>>> +        if (kvm_enabled()) {
>>> +            VFIOGroup *group;
>>> +
>>> +            QLIST_FOREACH(group, &container->group_list, container_next) {
>>> +                vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
>>> +                                      IOMMU_MEMORY_REGION(section->mr));
>>> +            }
>>> +        }
>>
>> So, here you're informing the region of the groups when the region is
>> mapped in.  But don't you similarly need to notify if a group is added
>> to an existing address space? 
> 
> It is either in VFIO for ages (forever? in vfio_connect_container()), or I
> did not understand the question...
> 
> 
>> And won't you also need notifications
>> of groups/regions being removed?
> 
> Same comment. vfio_disconnect_container()?
> 
> 
>>
>>> +#endif
>>>          vfio_host_win_add(container, section->offset_within_address_space,
>>>                            section->offset_within_address_space +
>>>                            int128_get64(section->size) - 1, pgsize);
>>> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
>>> index 32fd6a9b54..2b9af75c03 100644
>>> --- a/hw/vfio/spapr.c
>>> +++ b/hw/vfio/spapr.c
>>> @@ -15,8 +15,12 @@
>>>  
>>>  #include "hw/vfio/vfio-common.h"
>>>  #include "hw/hw.h"
>>> +#include "hw/ppc/spapr.h"
>>>  #include "qemu/error-report.h"
>>>  #include "trace.h"
>>> +#ifdef CONFIG_KVM
>>> +#include "linux/kvm.h"
>>> +#endif
>>>  
>>>  static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
>>>  {
>>> @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container,
>>>      return 0;
>>>  }
>>>  
>>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>>> +                          IOMMUMemoryRegion *iommu_mr)
>>> +{
>>> +#ifdef CONFIG_KVM
>>> +    struct kvm_vfio_spapr_tce param = {
>>> +        .groupfd = groupfd,
>>> +    };
>>> +    struct kvm_device_attr attr = {
>>> +        .group = KVM_DEV_VFIO_GROUP,
>>> +        .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
>>> +        .addr = (uint64_t)(unsigned long)&param,
>>> +    };
>>> +    IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr);
>>
>> This will assert if you have a non-spapr guest IOMMU on a ppc host
>> (e.g. emulating an x86 with VT-d under TCG).
> 
> 
> Sure but this should not execute when TCG, hence "_kvm" in
> "vfio_spapr_notify_kvm".
>
diff mbox

Patch

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 2a303a705c..c1d37e6356 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -591,6 +591,7 @@  void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr);
 #define RTAS_EVENT_SCAN_RATE    1
 
 typedef struct sPAPRTCETable sPAPRTCETable;
+typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass;
 
 #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table"
 #define SPAPR_TCE_TABLE(obj) \
@@ -599,6 +600,12 @@  typedef struct sPAPRTCETable sPAPRTCETable;
 #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region"
 #define SPAPR_IOMMU_MEMORY_REGION(obj) \
         OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION)
+#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \
+        OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \
+                         TYPE_SPAPR_IOMMU_MEMORY_REGION)
+#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \
+        OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \
+                           TYPE_SPAPR_IOMMU_MEMORY_REGION)
 
 struct sPAPRTCETable {
     DeviceState parent;
@@ -618,6 +625,14 @@  struct sPAPRTCETable {
     QLIST_ENTRY(sPAPRTCETable) list;
 };
 
+struct sPAPRIOMMUMemoryRegionClass {
+    /* private */
+    IOMMUMemoryRegionClass parent_class;
+
+    /* public */
+    int (*get_fd)(IOMMUMemoryRegion *iommu_mr);
+};
+
 sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn);
 
 struct sPAPREventLogEntry {
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f3a2ac9fee..d245d3cecc 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -177,6 +177,8 @@  extern const MemoryListener vfio_prereg_listener;
 int vfio_spapr_create_window(VFIOContainer *container,
                              MemoryRegionSection *section,
                              hwaddr *pgsize);
+int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
+                          IOMMUMemoryRegion *iommumr);
 int vfio_spapr_remove_window(VFIOContainer *container,
                              hwaddr offset_within_address_space);
 
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 307dc3021e..82fca61a75 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -171,6 +171,13 @@  static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
     }
 }
 
+static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr)
+{
+    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
+
+    return tcet->fd;
+}
+
 static int spapr_tce_table_post_load(void *opaque, int version_id)
 {
     sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
@@ -631,16 +638,19 @@  static TypeInfo spapr_tce_table_info = {
 static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
 {
     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+    sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass);
 
     imrc->translate = spapr_tce_translate_iommu;
     imrc->get_min_page_size = spapr_tce_get_min_page_size;
     imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
+    simrc->get_fd = spapr_tce_get_fd;
 }
 
 static const TypeInfo spapr_iommu_memory_region_info = {
     .parent = TYPE_IOMMU_MEMORY_REGION,
     .name = TYPE_SPAPR_IOMMU_MEMORY_REGION,
     .class_init = spapr_iommu_memory_region_class_init,
+    .class_size = sizeof(sPAPRIOMMUMemoryRegionClass),
 };
 
 static void register_types(void)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7b2924c0ef..92f1f88ae8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -454,6 +454,16 @@  static void vfio_listener_region_add(MemoryListener *listener,
             goto fail;
         }
 
+#ifdef CONFIG_KVM
+        if (kvm_enabled()) {
+            VFIOGroup *group;
+
+            QLIST_FOREACH(group, &container->group_list, container_next) {
+                vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
+                                      IOMMU_MEMORY_REGION(section->mr));
+            }
+        }
+#endif
         vfio_host_win_add(container, section->offset_within_address_space,
                           section->offset_within_address_space +
                           int128_get64(section->size) - 1, pgsize);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 32fd6a9b54..2b9af75c03 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -15,8 +15,12 @@ 
 
 #include "hw/vfio/vfio-common.h"
 #include "hw/hw.h"
+#include "hw/ppc/spapr.h"
 #include "qemu/error-report.h"
 #include "trace.h"
+#ifdef CONFIG_KVM
+#include "linux/kvm.h"
+#endif
 
 static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
 {
@@ -188,6 +192,41 @@  int vfio_spapr_create_window(VFIOContainer *container,
     return 0;
 }
 
+int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
+                          IOMMUMemoryRegion *iommu_mr)
+{
+#ifdef CONFIG_KVM
+    struct kvm_vfio_spapr_tce param = {
+        .groupfd = groupfd,
+    };
+    struct kvm_device_attr attr = {
+        .group = KVM_DEV_VFIO_GROUP,
+        .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
+        .addr = (uint64_t)(unsigned long)&param,
+    };
+    IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr);
+    sPAPRIOMMUMemoryRegionClass *simrc =
+            SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr);
+
+    if (!simrc->get_fd) {
+        error_report("vfio: No get_fd defined for IOMMU MR");
+        return -EFAULT;
+    }
+
+    param.tablefd = simrc->get_fd(spapr_iommu_mr);
+
+    if (param.tablefd != -1) {
+        if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+            error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
+                         param.tablefd, param.groupfd, strerror(errno));
+            return -errno;
+        }
+    }
+    trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
+#endif
+    return 0;
+}
+
 int vfio_spapr_remove_window(VFIOContainer *container,
                              hwaddr offset_within_address_space)
 {
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 2561c6d31a..084a92f7c2 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -123,3 +123,4 @@  vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
 vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
 vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
 vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
+vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"