Message ID | 20170720072231.35054-3-aik@ozlabs.ru |
---|---|
State | New |
Headers | show |
On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote: > This implements a notification for a new IOMMU group attached to > sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration. > > This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd() > callback which returns KVM fd associated with LIOBN, the notifier uses it > to establish link between LIOBN and IOMMU group in the KVM. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > --- > > The practical reason for adding get_fd() as a callback is avoiding static > linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when > CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest > of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides) > compiles only when CONFIG_PSERIES=y. Ok. Nonetheless I don't think the get_fd() method is a good idea. First, it's basically an abstraction violation, exposing the region's internal fd. Second, it's a method which only plausibly has one implementation which is rarely sensible. What this comes down to is that the guest IOMMU mechanism needs information about host vfio groups mapped - for an optimization in this case. So what would make sense to me is to put an "add_vfio_group" method into IOMMUMemoryRegionClass (or even MemoryRegionClass). In most cases that will be NULL (== no-op). For the spapr IOMMU region, it will (attempt to) connect the host group to the guest liobn. > --- > include/hw/ppc/spapr.h | 15 +++++++++++++++ > include/hw/vfio/vfio-common.h | 2 ++ > hw/ppc/spapr_iommu.c | 10 ++++++++++ > hw/vfio/common.c | 10 ++++++++++ > hw/vfio/spapr.c | 39 +++++++++++++++++++++++++++++++++++++++ > hw/vfio/trace-events | 1 + > 6 files changed, 77 insertions(+) > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index 2a303a705c..c1d37e6356 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr); > #define RTAS_EVENT_SCAN_RATE 1 > > typedef struct sPAPRTCETable sPAPRTCETable; > +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass; > > #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table" > #define SPAPR_TCE_TABLE(obj) \ > @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable; > #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region" > #define SPAPR_IOMMU_MEMORY_REGION(obj) \ > OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION) > +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \ > + OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \ > + TYPE_SPAPR_IOMMU_MEMORY_REGION) > +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \ > + OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \ > + TYPE_SPAPR_IOMMU_MEMORY_REGION) > > struct sPAPRTCETable { > DeviceState parent; > @@ -618,6 +625,14 @@ struct sPAPRTCETable { > QLIST_ENTRY(sPAPRTCETable) list; > }; > > +struct sPAPRIOMMUMemoryRegionClass { > + /* private */ > + IOMMUMemoryRegionClass parent_class; > + > + /* public */ > + int (*get_fd)(IOMMUMemoryRegion *iommu_mr); > +}; > + > sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn); To make sure I'm understanding correctly: the MR subclass here is representing a guest-side property, yes? It means that on the guest side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE interface. > struct sPAPREventLogEntry { > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index f3a2ac9fee..d245d3cecc 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener; > int vfio_spapr_create_window(VFIOContainer *container, > MemoryRegionSection *section, > hwaddr *pgsize); > +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd, > + IOMMUMemoryRegion *iommumr); > int vfio_spapr_remove_window(VFIOContainer *container, > hwaddr offset_within_address_space); > > diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c > index 307dc3021e..82fca61a75 100644 > --- a/hw/ppc/spapr_iommu.c > +++ b/hw/ppc/spapr_iommu.c > @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu, > } > } > > +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr) > +{ > + sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu); > + > + return tcet->fd; Does this have a well defined value if there's no KVM? > +} > + > static int spapr_tce_table_post_load(void *opaque, int version_id) > { > sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); > @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = { > static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data) > { > IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); > + sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass); > > imrc->translate = spapr_tce_translate_iommu; > imrc->get_min_page_size = spapr_tce_get_min_page_size; > imrc->notify_flag_changed = spapr_tce_notify_flag_changed; > + simrc->get_fd = spapr_tce_get_fd; > } > > static const TypeInfo spapr_iommu_memory_region_info = { > .parent = TYPE_IOMMU_MEMORY_REGION, > .name = TYPE_SPAPR_IOMMU_MEMORY_REGION, > .class_init = spapr_iommu_memory_region_class_init, > + .class_size = sizeof(sPAPRIOMMUMemoryRegionClass), > }; > > static void register_types(void) > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 7b2924c0ef..92f1f88ae8 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener, > goto fail; > } > > +#ifdef CONFIG_KVM > + if (kvm_enabled()) { > + VFIOGroup *group; > + > + QLIST_FOREACH(group, &container->group_list, container_next) { > + vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd, > + IOMMU_MEMORY_REGION(section->mr)); > + } > + } So, here you're informing the region of the groups when the region is mapped in. But don't you similarly need to notify if a group is added to an existing address space? And won't you also need notifications of groups/regions being removed? > +#endif > vfio_host_win_add(container, section->offset_within_address_space, > section->offset_within_address_space + > int128_get64(section->size) - 1, pgsize); > diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c > index 32fd6a9b54..2b9af75c03 100644 > --- a/hw/vfio/spapr.c > +++ b/hw/vfio/spapr.c > @@ -15,8 +15,12 @@ > > #include "hw/vfio/vfio-common.h" > #include "hw/hw.h" > +#include "hw/ppc/spapr.h" > #include "qemu/error-report.h" > #include "trace.h" > +#ifdef CONFIG_KVM > +#include "linux/kvm.h" > +#endif > > static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) > { > @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container, > return 0; > } > > +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd, > + IOMMUMemoryRegion *iommu_mr) > +{ > +#ifdef CONFIG_KVM > + struct kvm_vfio_spapr_tce param = { > + .groupfd = groupfd, > + }; > + struct kvm_device_attr attr = { > + .group = KVM_DEV_VFIO_GROUP, > + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, > + .addr = (uint64_t)(unsigned long)¶m, > + }; > + IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr); This will assert if you have a non-spapr guest IOMMU on a ppc host (e.g. emulating an x86 with VT-d under TCG). > + sPAPRIOMMUMemoryRegionClass *simrc = > + SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr); > + > + if (!simrc->get_fd) { > + error_report("vfio: No get_fd defined for IOMMU MR"); > + return -EFAULT; > + } > + > + param.tablefd = simrc->get_fd(spapr_iommu_mr); > + > + if (param.tablefd != -1) { > + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { > + error_report("vfio: failed to setup fd %d for a group with fd %d: %s", > + param.tablefd, param.groupfd, strerror(errno)); > + return -errno; > + } > + } > + trace_vfio_spapr_notify_kvm(groupfd, param.tablefd); > +#endif > + return 0; > +} > + > int vfio_spapr_remove_window(VFIOContainer *container, > hwaddr offset_within_address_space) > { > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index 2561c6d31a..084a92f7c2 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P > vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" > vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 > vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 > +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
On 25/08/17 16:21, David Gibson wrote: > On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote: >> This implements a notification for a new IOMMU group attached to >> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration. >> >> This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd() >> callback which returns KVM fd associated with LIOBN, the notifier uses it >> to establish link between LIOBN and IOMMU group in the KVM. >> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >> --- >> >> The practical reason for adding get_fd() as a callback is avoiding static >> linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when >> CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest >> of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides) >> compiles only when CONFIG_PSERIES=y. > > Ok. Nonetheless I don't think the get_fd() method is a good idea. > First, it's basically an abstraction violation, exposing the region's > internal fd. Second, it's a method which only plausibly has one > implementation which is rarely sensible. > > What this comes down to is that the guest IOMMU mechanism needs > information about host vfio groups mapped - for an optimization in > this case. > > So what would make sense to me is to put an "add_vfio_group" method > into IOMMUMemoryRegionClass (or even MemoryRegionClass). In most > cases that will be NULL (== no-op). For the spapr IOMMU region, it > will (attempt to) connect the host group to the guest liobn. Like this? IOMMUMemoryRegionClass::add_vfio_group_kvm(int kvm_device_fd, int groupfd); It is just cleaner to keep kvm_device_fd and its ioclts() in the same place but ok. > >> --- >> include/hw/ppc/spapr.h | 15 +++++++++++++++ >> include/hw/vfio/vfio-common.h | 2 ++ >> hw/ppc/spapr_iommu.c | 10 ++++++++++ >> hw/vfio/common.c | 10 ++++++++++ >> hw/vfio/spapr.c | 39 +++++++++++++++++++++++++++++++++++++++ >> hw/vfio/trace-events | 1 + >> 6 files changed, 77 insertions(+) >> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h >> index 2a303a705c..c1d37e6356 100644 >> --- a/include/hw/ppc/spapr.h >> +++ b/include/hw/ppc/spapr.h >> @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr); >> #define RTAS_EVENT_SCAN_RATE 1 >> >> typedef struct sPAPRTCETable sPAPRTCETable; >> +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass; >> >> #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table" >> #define SPAPR_TCE_TABLE(obj) \ >> @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable; >> #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region" >> #define SPAPR_IOMMU_MEMORY_REGION(obj) \ >> OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION) >> +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \ >> + OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \ >> + TYPE_SPAPR_IOMMU_MEMORY_REGION) >> +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \ >> + OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \ >> + TYPE_SPAPR_IOMMU_MEMORY_REGION) >> >> struct sPAPRTCETable { >> DeviceState parent; >> @@ -618,6 +625,14 @@ struct sPAPRTCETable { >> QLIST_ENTRY(sPAPRTCETable) list; >> }; >> >> +struct sPAPRIOMMUMemoryRegionClass { >> + /* private */ >> + IOMMUMemoryRegionClass parent_class; >> + >> + /* public */ >> + int (*get_fd)(IOMMUMemoryRegion *iommu_mr); >> +}; >> + >> sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn); > > To make sure I'm understanding correctly: the MR subclass here is > representing a guest-side property, yes? It means that on the guest > side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE > interface. I do not understand the question, sorry. MR is a QEMU thing, only QEMU devices use these MRs as MRs. >> struct sPAPREventLogEntry { >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> index f3a2ac9fee..d245d3cecc 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener; >> int vfio_spapr_create_window(VFIOContainer *container, >> MemoryRegionSection *section, >> hwaddr *pgsize); >> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd, >> + IOMMUMemoryRegion *iommumr); >> int vfio_spapr_remove_window(VFIOContainer *container, >> hwaddr offset_within_address_space); >> >> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c >> index 307dc3021e..82fca61a75 100644 >> --- a/hw/ppc/spapr_iommu.c >> +++ b/hw/ppc/spapr_iommu.c >> @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu, >> } >> } >> >> +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr) >> +{ >> + sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu); >> + >> + return tcet->fd; > > Does this have a well defined value if there's no KVM? It is -1 in TCG, any other value is undefined. > >> +} >> + >> static int spapr_tce_table_post_load(void *opaque, int version_id) >> { >> sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); >> @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = { >> static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data) >> { >> IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); >> + sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass); >> >> imrc->translate = spapr_tce_translate_iommu; >> imrc->get_min_page_size = spapr_tce_get_min_page_size; >> imrc->notify_flag_changed = spapr_tce_notify_flag_changed; >> + simrc->get_fd = spapr_tce_get_fd; >> } >> >> static const TypeInfo spapr_iommu_memory_region_info = { >> .parent = TYPE_IOMMU_MEMORY_REGION, >> .name = TYPE_SPAPR_IOMMU_MEMORY_REGION, >> .class_init = spapr_iommu_memory_region_class_init, >> + .class_size = sizeof(sPAPRIOMMUMemoryRegionClass), >> }; >> >> static void register_types(void) >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 7b2924c0ef..92f1f88ae8 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener, >> goto fail; >> } >> >> +#ifdef CONFIG_KVM >> + if (kvm_enabled()) { >> + VFIOGroup *group; >> + >> + QLIST_FOREACH(group, &container->group_list, container_next) { >> + vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd, >> + IOMMU_MEMORY_REGION(section->mr)); >> + } >> + } > > So, here you're informing the region of the groups when the region is > mapped in. But don't you similarly need to notify if a group is added > to an existing address space? It is either in VFIO for ages (forever? in vfio_connect_container()), or I did not understand the question... > And won't you also need notifications > of groups/regions being removed? Same comment. vfio_disconnect_container()? > >> +#endif >> vfio_host_win_add(container, section->offset_within_address_space, >> section->offset_within_address_space + >> int128_get64(section->size) - 1, pgsize); >> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c >> index 32fd6a9b54..2b9af75c03 100644 >> --- a/hw/vfio/spapr.c >> +++ b/hw/vfio/spapr.c >> @@ -15,8 +15,12 @@ >> >> #include "hw/vfio/vfio-common.h" >> #include "hw/hw.h" >> +#include "hw/ppc/spapr.h" >> #include "qemu/error-report.h" >> #include "trace.h" >> +#ifdef CONFIG_KVM >> +#include "linux/kvm.h" >> +#endif >> >> static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) >> { >> @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container, >> return 0; >> } >> >> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd, >> + IOMMUMemoryRegion *iommu_mr) >> +{ >> +#ifdef CONFIG_KVM >> + struct kvm_vfio_spapr_tce param = { >> + .groupfd = groupfd, >> + }; >> + struct kvm_device_attr attr = { >> + .group = KVM_DEV_VFIO_GROUP, >> + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, >> + .addr = (uint64_t)(unsigned long)¶m, >> + }; >> + IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr); > > This will assert if you have a non-spapr guest IOMMU on a ppc host > (e.g. emulating an x86 with VT-d under TCG). Sure but this should not execute when TCG, hence "_kvm" in "vfio_spapr_notify_kvm". > >> + sPAPRIOMMUMemoryRegionClass *simrc = >> + SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr); >> + >> + if (!simrc->get_fd) { >> + error_report("vfio: No get_fd defined for IOMMU MR"); >> + return -EFAULT; >> + } >> + >> + param.tablefd = simrc->get_fd(spapr_iommu_mr); >> + >> + if (param.tablefd != -1) { >> + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { >> + error_report("vfio: failed to setup fd %d for a group with fd %d: %s", >> + param.tablefd, param.groupfd, strerror(errno)); >> + return -errno; >> + } >> + } >> + trace_vfio_spapr_notify_kvm(groupfd, param.tablefd); >> +#endif >> + return 0; >> +} >> + >> int vfio_spapr_remove_window(VFIOContainer *container, >> hwaddr offset_within_address_space) >> { >> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >> index 2561c6d31a..084a92f7c2 100644 >> --- a/hw/vfio/trace-events >> +++ b/hw/vfio/trace-events >> @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P >> vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" >> vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 >> vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 >> +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" >
On 28/08/17 14:20, Alexey Kardashevskiy wrote: > On 25/08/17 16:21, David Gibson wrote: >> On Thu, Jul 20, 2017 at 05:22:30PM +1000, Alexey Kardashevskiy wrote: >>> This implements a notification for a new IOMMU group attached to >>> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration. >>> >>> This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd() >>> callback which returns KVM fd associated with LIOBN, the notifier uses it >>> to establish link between LIOBN and IOMMU group in the KVM. >>> >>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Ping? >>> --- >>> >>> The practical reason for adding get_fd() as a callback is avoiding static >>> linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when >>> CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest >>> of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides) >>> compiles only when CONFIG_PSERIES=y. >> >> Ok. Nonetheless I don't think the get_fd() method is a good idea. >> First, it's basically an abstraction violation, exposing the region's >> internal fd. Second, it's a method which only plausibly has one >> implementation which is rarely sensible. >> >> What this comes down to is that the guest IOMMU mechanism needs >> information about host vfio groups mapped - for an optimization in >> this case. >> >> So what would make sense to me is to put an "add_vfio_group" method >> into IOMMUMemoryRegionClass (or even MemoryRegionClass). In most >> cases that will be NULL (== no-op). For the spapr IOMMU region, it >> will (attempt to) connect the host group to the guest liobn. > > > Like this? > > IOMMUMemoryRegionClass::add_vfio_group_kvm(int kvm_device_fd, int groupfd); > > It is just cleaner to keep kvm_device_fd and its ioclts() in the same place > but ok. > > >> >>> --- >>> include/hw/ppc/spapr.h | 15 +++++++++++++++ >>> include/hw/vfio/vfio-common.h | 2 ++ >>> hw/ppc/spapr_iommu.c | 10 ++++++++++ >>> hw/vfio/common.c | 10 ++++++++++ >>> hw/vfio/spapr.c | 39 +++++++++++++++++++++++++++++++++++++++ >>> hw/vfio/trace-events | 1 + >>> 6 files changed, 77 insertions(+) >>> >>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h >>> index 2a303a705c..c1d37e6356 100644 >>> --- a/include/hw/ppc/spapr.h >>> +++ b/include/hw/ppc/spapr.h >>> @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr); >>> #define RTAS_EVENT_SCAN_RATE 1 >>> >>> typedef struct sPAPRTCETable sPAPRTCETable; >>> +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass; >>> >>> #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table" >>> #define SPAPR_TCE_TABLE(obj) \ >>> @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable; >>> #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region" >>> #define SPAPR_IOMMU_MEMORY_REGION(obj) \ >>> OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION) >>> +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \ >>> + OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \ >>> + TYPE_SPAPR_IOMMU_MEMORY_REGION) >>> +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \ >>> + OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \ >>> + TYPE_SPAPR_IOMMU_MEMORY_REGION) >>> >>> struct sPAPRTCETable { >>> DeviceState parent; >>> @@ -618,6 +625,14 @@ struct sPAPRTCETable { >>> QLIST_ENTRY(sPAPRTCETable) list; >>> }; >>> >>> +struct sPAPRIOMMUMemoryRegionClass { >>> + /* private */ >>> + IOMMUMemoryRegionClass parent_class; >>> + >>> + /* public */ >>> + int (*get_fd)(IOMMUMemoryRegion *iommu_mr); >>> +}; >>> + >>> sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn); >> >> To make sure I'm understanding correctly: the MR subclass here is >> representing a guest-side property, yes? It means that on the guest >> side the IOMMU mappings are managed by the PAPR {GET,PUT}_TCE >> interface. > > I do not understand the question, sorry. MR is a QEMU thing, only QEMU > devices use these MRs as MRs. > > >>> struct sPAPREventLogEntry { >>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >>> index f3a2ac9fee..d245d3cecc 100644 >>> --- a/include/hw/vfio/vfio-common.h >>> +++ b/include/hw/vfio/vfio-common.h >>> @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener; >>> int vfio_spapr_create_window(VFIOContainer *container, >>> MemoryRegionSection *section, >>> hwaddr *pgsize); >>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd, >>> + IOMMUMemoryRegion *iommumr); >>> int vfio_spapr_remove_window(VFIOContainer *container, >>> hwaddr offset_within_address_space); >>> >>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c >>> index 307dc3021e..82fca61a75 100644 >>> --- a/hw/ppc/spapr_iommu.c >>> +++ b/hw/ppc/spapr_iommu.c >>> @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu, >>> } >>> } >>> >>> +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr) >>> +{ >>> + sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu); >>> + >>> + return tcet->fd; >> >> Does this have a well defined value if there's no KVM? > > > It is -1 in TCG, any other value is undefined. > > >> >>> +} >>> + >>> static int spapr_tce_table_post_load(void *opaque, int version_id) >>> { >>> sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); >>> @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = { >>> static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data) >>> { >>> IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); >>> + sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass); >>> >>> imrc->translate = spapr_tce_translate_iommu; >>> imrc->get_min_page_size = spapr_tce_get_min_page_size; >>> imrc->notify_flag_changed = spapr_tce_notify_flag_changed; >>> + simrc->get_fd = spapr_tce_get_fd; >>> } >>> >>> static const TypeInfo spapr_iommu_memory_region_info = { >>> .parent = TYPE_IOMMU_MEMORY_REGION, >>> .name = TYPE_SPAPR_IOMMU_MEMORY_REGION, >>> .class_init = spapr_iommu_memory_region_class_init, >>> + .class_size = sizeof(sPAPRIOMMUMemoryRegionClass), >>> }; >>> >>> static void register_types(void) >>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >>> index 7b2924c0ef..92f1f88ae8 100644 >>> --- a/hw/vfio/common.c >>> +++ b/hw/vfio/common.c >>> @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener, >>> goto fail; >>> } >>> >>> +#ifdef CONFIG_KVM >>> + if (kvm_enabled()) { >>> + VFIOGroup *group; >>> + >>> + QLIST_FOREACH(group, &container->group_list, container_next) { >>> + vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd, >>> + IOMMU_MEMORY_REGION(section->mr)); >>> + } >>> + } >> >> So, here you're informing the region of the groups when the region is >> mapped in. But don't you similarly need to notify if a group is added >> to an existing address space? > > It is either in VFIO for ages (forever? in vfio_connect_container()), or I > did not understand the question... > > >> And won't you also need notifications >> of groups/regions being removed? > > Same comment. vfio_disconnect_container()? > > >> >>> +#endif >>> vfio_host_win_add(container, section->offset_within_address_space, >>> section->offset_within_address_space + >>> int128_get64(section->size) - 1, pgsize); >>> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c >>> index 32fd6a9b54..2b9af75c03 100644 >>> --- a/hw/vfio/spapr.c >>> +++ b/hw/vfio/spapr.c >>> @@ -15,8 +15,12 @@ >>> >>> #include "hw/vfio/vfio-common.h" >>> #include "hw/hw.h" >>> +#include "hw/ppc/spapr.h" >>> #include "qemu/error-report.h" >>> #include "trace.h" >>> +#ifdef CONFIG_KVM >>> +#include "linux/kvm.h" >>> +#endif >>> >>> static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) >>> { >>> @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container, >>> return 0; >>> } >>> >>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd, >>> + IOMMUMemoryRegion *iommu_mr) >>> +{ >>> +#ifdef CONFIG_KVM >>> + struct kvm_vfio_spapr_tce param = { >>> + .groupfd = groupfd, >>> + }; >>> + struct kvm_device_attr attr = { >>> + .group = KVM_DEV_VFIO_GROUP, >>> + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, >>> + .addr = (uint64_t)(unsigned long)¶m, >>> + }; >>> + IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr); >> >> This will assert if you have a non-spapr guest IOMMU on a ppc host >> (e.g. emulating an x86 with VT-d under TCG). > > > Sure but this should not execute when TCG, hence "_kvm" in > "vfio_spapr_notify_kvm". >
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 2a303a705c..c1d37e6356 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -591,6 +591,7 @@ void spapr_load_rtas(sPAPRMachineState *spapr, void *fdt, hwaddr addr); #define RTAS_EVENT_SCAN_RATE 1 typedef struct sPAPRTCETable sPAPRTCETable; +typedef struct sPAPRIOMMUMemoryRegionClass sPAPRIOMMUMemoryRegionClass; #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table" #define SPAPR_TCE_TABLE(obj) \ @@ -599,6 +600,12 @@ typedef struct sPAPRTCETable sPAPRTCETable; #define TYPE_SPAPR_IOMMU_MEMORY_REGION "spapr-iommu-memory-region" #define SPAPR_IOMMU_MEMORY_REGION(obj) \ OBJECT_CHECK(IOMMUMemoryRegion, (obj), TYPE_SPAPR_IOMMU_MEMORY_REGION) +#define SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(obj) \ + OBJECT_GET_CLASS(sPAPRIOMMUMemoryRegionClass, obj, \ + TYPE_SPAPR_IOMMU_MEMORY_REGION) +#define SPAPR_IOMMU_MEMORY_REGION_CLASS(klass) \ + OBJECT_CLASS_CHECK(sPAPRIOMMUMemoryRegionClass, klass, \ + TYPE_SPAPR_IOMMU_MEMORY_REGION) struct sPAPRTCETable { DeviceState parent; @@ -618,6 +625,14 @@ struct sPAPRTCETable { QLIST_ENTRY(sPAPRTCETable) list; }; +struct sPAPRIOMMUMemoryRegionClass { + /* private */ + IOMMUMemoryRegionClass parent_class; + + /* public */ + int (*get_fd)(IOMMUMemoryRegion *iommu_mr); +}; + sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn); struct sPAPREventLogEntry { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f3a2ac9fee..d245d3cecc 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -177,6 +177,8 @@ extern const MemoryListener vfio_prereg_listener; int vfio_spapr_create_window(VFIOContainer *container, MemoryRegionSection *section, hwaddr *pgsize); +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd, + IOMMUMemoryRegion *iommumr); int vfio_spapr_remove_window(VFIOContainer *container, hwaddr offset_within_address_space); diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 307dc3021e..82fca61a75 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -171,6 +171,13 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu, } } +static int spapr_tce_get_fd(IOMMUMemoryRegion *iommu_mr) +{ + sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu); + + return tcet->fd; +} + static int spapr_tce_table_post_load(void *opaque, int version_id) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); @@ -631,16 +638,19 @@ static TypeInfo spapr_tce_table_info = { static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data) { IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); + sPAPRIOMMUMemoryRegionClass *simrc = SPAPR_IOMMU_MEMORY_REGION_CLASS(klass); imrc->translate = spapr_tce_translate_iommu; imrc->get_min_page_size = spapr_tce_get_min_page_size; imrc->notify_flag_changed = spapr_tce_notify_flag_changed; + simrc->get_fd = spapr_tce_get_fd; } static const TypeInfo spapr_iommu_memory_region_info = { .parent = TYPE_IOMMU_MEMORY_REGION, .name = TYPE_SPAPR_IOMMU_MEMORY_REGION, .class_init = spapr_iommu_memory_region_class_init, + .class_size = sizeof(sPAPRIOMMUMemoryRegionClass), }; static void register_types(void) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7b2924c0ef..92f1f88ae8 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -454,6 +454,16 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } +#ifdef CONFIG_KVM + if (kvm_enabled()) { + VFIOGroup *group; + + QLIST_FOREACH(group, &container->group_list, container_next) { + vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd, + IOMMU_MEMORY_REGION(section->mr)); + } + } +#endif vfio_host_win_add(container, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1, pgsize); diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 32fd6a9b54..2b9af75c03 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -15,8 +15,12 @@ #include "hw/vfio/vfio-common.h" #include "hw/hw.h" +#include "hw/ppc/spapr.h" #include "qemu/error-report.h" #include "trace.h" +#ifdef CONFIG_KVM +#include "linux/kvm.h" +#endif static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) { @@ -188,6 +192,41 @@ int vfio_spapr_create_window(VFIOContainer *container, return 0; } +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd, + IOMMUMemoryRegion *iommu_mr) +{ +#ifdef CONFIG_KVM + struct kvm_vfio_spapr_tce param = { + .groupfd = groupfd, + }; + struct kvm_device_attr attr = { + .group = KVM_DEV_VFIO_GROUP, + .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, + .addr = (uint64_t)(unsigned long)¶m, + }; + IOMMUMemoryRegion *spapr_iommu_mr = SPAPR_IOMMU_MEMORY_REGION(iommu_mr); + sPAPRIOMMUMemoryRegionClass *simrc = + SPAPR_IOMMU_MEMORY_REGION_GET_CLASS(spapr_iommu_mr); + + if (!simrc->get_fd) { + error_report("vfio: No get_fd defined for IOMMU MR"); + return -EFAULT; + } + + param.tablefd = simrc->get_fd(spapr_iommu_mr); + + if (param.tablefd != -1) { + if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { + error_report("vfio: failed to setup fd %d for a group with fd %d: %s", + param.tablefd, param.groupfd, strerror(errno)); + return -errno; + } + } + trace_vfio_spapr_notify_kvm(groupfd, param.tablefd); +#endif + return 0; +} + int vfio_spapr_remove_window(VFIOContainer *container, hwaddr offset_within_address_space) { diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 2561c6d31a..084a92f7c2 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
This implements a notification for a new IOMMU group attached to sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration. This extends the TYPE_SPAPR_IOMMU_MEMORY_REGION class with a get_fd() callback which returns KVM fd associated with LIOBN, the notifier uses it to establish link between LIOBN and IOMMU group in the KVM. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> --- The practical reason for adding get_fd() as a callback is avoiding static linking to spapt_tce_get_fd(): hw/vfio/spapr.c compiles when CONFIG_SOFTMMU=y to avoid multiple "ifdef PSERIES"'s in the rest of VFIO code but hw/ppc/spapr_iommu.c (where spapt_tce_get_fd() besides) compiles only when CONFIG_PSERIES=y. --- include/hw/ppc/spapr.h | 15 +++++++++++++++ include/hw/vfio/vfio-common.h | 2 ++ hw/ppc/spapr_iommu.c | 10 ++++++++++ hw/vfio/common.c | 10 ++++++++++ hw/vfio/spapr.c | 39 +++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 1 + 6 files changed, 77 insertions(+)