diff mbox

[RFC,qemu,v3,3/4] vfio/spapr: Add a notifier for PPC64 HV/PR KVM about new group attached to LIOBN

Message ID 20170401123741.38469-4-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy April 1, 2017, 12:37 p.m. UTC
This implements a notification for a new IOMMU group attached to
sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 include/hw/ppc/spapr.h        |  1 +
 include/hw/vfio/vfio-common.h |  2 ++
 hw/ppc/spapr_iommu.c          |  5 +++++
 hw/vfio/common.c              | 10 ++++++++++
 hw/vfio/spapr.c               | 31 +++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |  1 +
 6 files changed, 50 insertions(+)

Comments

David Gibson April 3, 2017, 3:01 a.m. UTC | #1
On Sat, Apr 01, 2017 at 11:37:40PM +1100, Alexey Kardashevskiy wrote:
> This implements a notification for a new IOMMU group attached to
> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  include/hw/ppc/spapr.h        |  1 +
>  include/hw/vfio/vfio-common.h |  2 ++
>  hw/ppc/spapr_iommu.c          |  5 +++++
>  hw/vfio/common.c              | 10 ++++++++++
>  hw/vfio/spapr.c               | 31 +++++++++++++++++++++++++++++++
>  hw/vfio/trace-events          |  1 +
>  6 files changed, 50 insertions(+)
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 6997ed7e98..8a1b32f89a 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -617,6 +617,7 @@ void spapr_tce_table_enable(sPAPRTCETable *tcet,
>                              uint32_t page_shift, uint64_t bus_offset,
>                              uint32_t nb_table);
>  void spapr_tce_table_disable(sPAPRTCETable *tcet);
> +int spapr_tce_get_fd(sPAPRTCETable *tcet);
>  void spapr_tce_set_need_vfio(sPAPRTCETable *tcet, bool need_vfio);
>  
>  MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet);
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 7a4135ae6f..b99f4af96e 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -175,6 +175,8 @@ extern const MemoryListener vfio_prereg_listener;
>  int vfio_spapr_create_window(VFIOContainer *container,
>                               MemoryRegionSection *section,
>                               hwaddr *pgsize);
> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
> +                          IOMMUMemoryRegion *iommumr);
>  int vfio_spapr_remove_window(VFIOContainer *container,
>                               hwaddr offset_within_address_space);
>  
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 5051110b9d..f7531a6408 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -171,6 +171,11 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>      }
>  }
>  
> +int spapr_tce_get_fd(sPAPRTCETable *tcet)
> +{
> +    return tcet->fd;
> +}
> +

I don't think this actually abstracts anything worthwhile.  The caller
needs the sPAPRTCETable definition anyway to use container_of(), so it
might as well just grab the field directly.

>  static int spapr_tce_table_post_load(void *opaque, int version_id)
>  {
>      sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index e8188eb3d5..b94b29be15 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -440,6 +440,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
>              goto fail;
>          }
>  
> +#ifdef CONFIG_KVM
> +        if (kvm_enabled()) {
> +            VFIOGroup *group;
> +
> +            QLIST_FOREACH(group, &container->group_list, container_next) {
> +                vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
> +                                      IOMMU_MEMORY_REGION(section->mr));
> +            }
> +        }
> +#endif
>          vfio_host_win_add(container, section->offset_within_address_space,
>                            section->offset_within_address_space +
>                            int128_get64(section->size) - 1, pgsize);
> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
> index 551870d46b..6410438e62 100644
> --- a/hw/vfio/spapr.c
> +++ b/hw/vfio/spapr.c
> @@ -15,8 +15,12 @@
>  
>  #include "hw/vfio/vfio-common.h"
>  #include "hw/hw.h"
> +#include "hw/ppc/spapr.h"
>  #include "qemu/error-report.h"
>  #include "trace.h"
> +#ifdef CONFIG_KVM
> +#include "linux/kvm.h"
> +#endif
>  
>  static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
>  {
> @@ -188,6 +192,33 @@ int vfio_spapr_create_window(VFIOContainer *container,
>      return 0;
>  }
>  
> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
> +                          IOMMUMemoryRegion *iommumr)
> +{
> +#ifdef CONFIG_KVM
> +    struct kvm_vfio_spapr_tce param = {
> +        .groupfd = groupfd,
> +    };
> +    struct kvm_device_attr attr = {
> +        .group = KVM_DEV_VFIO_GROUP,
> +        .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
> +        .addr = (uint64_t)(unsigned long)&param,
> +    };
> +    sPAPRTCETable *tcet = container_of(iommumr, sPAPRTCETable, iommu);

This isn't safe.  The caller has verified that the host backend IOMMU
is sPAPR TCE, but you haven't verified that the *guest* IOMMU is TCE
based.  I suspect other details would prevent a TCG x86 machine with
VT-d running on a Power host from getting this far, but it's not good
to rely on that.

So, you need to explicitly verify that the guest IOMMU region really
is a PAPR TCE region.  The obvious way would be to continue your
QOMification and make sPAPRTCETable a subtype of IOMMUMemoryRegion,
rather than just including it by composition.

> +
> +    param.tablefd = spapr_tce_get_fd(tcet);
> +    if (param.tablefd != -1) {
> +        if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
> +            error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
> +                         param.tablefd, param.groupfd, strerror(errno));
> +            return -errno;
> +        }
> +    }
> +    trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
> +#endif
> +    return 0;
> +}
> +
>  int vfio_spapr_remove_window(VFIOContainer *container,
>                               hwaddr offset_within_address_space)
>  {
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 2561c6d31a..084a92f7c2 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
>  vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>  vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
>  vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
> +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
Alexey Kardashevskiy May 1, 2017, 12:21 p.m. UTC | #2
On 03/04/17 13:01, David Gibson wrote:
> On Sat, Apr 01, 2017 at 11:37:40PM +1100, Alexey Kardashevskiy wrote:
>> This implements a notification for a new IOMMU group attached to
>> sPAPR's logical IO bus (LIOBN) to enable in-kernel TCE acceleration.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  include/hw/ppc/spapr.h        |  1 +
>>  include/hw/vfio/vfio-common.h |  2 ++
>>  hw/ppc/spapr_iommu.c          |  5 +++++
>>  hw/vfio/common.c              | 10 ++++++++++
>>  hw/vfio/spapr.c               | 31 +++++++++++++++++++++++++++++++
>>  hw/vfio/trace-events          |  1 +
>>  6 files changed, 50 insertions(+)
>>
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 6997ed7e98..8a1b32f89a 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -617,6 +617,7 @@ void spapr_tce_table_enable(sPAPRTCETable *tcet,
>>                              uint32_t page_shift, uint64_t bus_offset,
>>                              uint32_t nb_table);
>>  void spapr_tce_table_disable(sPAPRTCETable *tcet);
>> +int spapr_tce_get_fd(sPAPRTCETable *tcet);
>>  void spapr_tce_set_need_vfio(sPAPRTCETable *tcet, bool need_vfio);
>>  
>>  MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet);
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 7a4135ae6f..b99f4af96e 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -175,6 +175,8 @@ extern const MemoryListener vfio_prereg_listener;
>>  int vfio_spapr_create_window(VFIOContainer *container,
>>                               MemoryRegionSection *section,
>>                               hwaddr *pgsize);
>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>> +                          IOMMUMemoryRegion *iommumr);
>>  int vfio_spapr_remove_window(VFIOContainer *container,
>>                               hwaddr offset_within_address_space);
>>  
>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>> index 5051110b9d..f7531a6408 100644
>> --- a/hw/ppc/spapr_iommu.c
>> +++ b/hw/ppc/spapr_iommu.c
>> @@ -171,6 +171,11 @@ static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
>>      }
>>  }
>>  
>> +int spapr_tce_get_fd(sPAPRTCETable *tcet)
>> +{
>> +    return tcet->fd;
>> +}
>> +
> 
> I don't think this actually abstracts anything worthwhile.  The caller
> needs the sPAPRTCETable definition anyway to use container_of(), so it
> might as well just grab the field directly.

So far @fd has only been accesses from hw/ppc/spapr_iommu.c and I'd like to
keep it that way ("encapsulation"?).


> 
>>  static int spapr_tce_table_post_load(void *opaque, int version_id)
>>  {
>>      sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index e8188eb3d5..b94b29be15 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -440,6 +440,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>              goto fail;
>>          }
>>  
>> +#ifdef CONFIG_KVM
>> +        if (kvm_enabled()) {
>> +            VFIOGroup *group;
>> +
>> +            QLIST_FOREACH(group, &container->group_list, container_next) {
>> +                vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
>> +                                      IOMMU_MEMORY_REGION(section->mr));
>> +            }
>> +        }
>> +#endif
>>          vfio_host_win_add(container, section->offset_within_address_space,
>>                            section->offset_within_address_space +
>>                            int128_get64(section->size) - 1, pgsize);
>> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
>> index 551870d46b..6410438e62 100644
>> --- a/hw/vfio/spapr.c
>> +++ b/hw/vfio/spapr.c
>> @@ -15,8 +15,12 @@
>>  
>>  #include "hw/vfio/vfio-common.h"
>>  #include "hw/hw.h"
>> +#include "hw/ppc/spapr.h"
>>  #include "qemu/error-report.h"
>>  #include "trace.h"
>> +#ifdef CONFIG_KVM
>> +#include "linux/kvm.h"
>> +#endif
>>  
>>  static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
>>  {
>> @@ -188,6 +192,33 @@ int vfio_spapr_create_window(VFIOContainer *container,
>>      return 0;
>>  }
>>  
>> +int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
>> +                          IOMMUMemoryRegion *iommumr)
>> +{
>> +#ifdef CONFIG_KVM
>> +    struct kvm_vfio_spapr_tce param = {
>> +        .groupfd = groupfd,
>> +    };
>> +    struct kvm_device_attr attr = {
>> +        .group = KVM_DEV_VFIO_GROUP,
>> +        .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
>> +        .addr = (uint64_t)(unsigned long)&param,
>> +    };
>> +    sPAPRTCETable *tcet = container_of(iommumr, sPAPRTCETable, iommu);
> 
> This isn't safe.  The caller has verified that the host backend IOMMU
> is sPAPR TCE, but you haven't verified that the *guest* IOMMU is TCE
> based.  I suspect other details would prevent a TCG x86 machine with
> VT-d running on a Power host from getting this far, but it's not good
> to rely on that.
> 
> So, you need to explicitly verify that the guest IOMMU region really
> is a PAPR TCE region.  The obvious way would be to continue your
> QOMification and make sPAPRTCETable a subtype of IOMMUMemoryRegion,
> rather than just including it by composition.

sPAPRTCETable is a device now, with 2 memory regions - one for entire 64bit
space and different sPAPRTCETable root MRs overlap, another one is the
actual IOMMU MR - may be QOM just this one, not the entire sPAPRTCETable
thingy?




> 
>> +
>> +    param.tablefd = spapr_tce_get_fd(tcet);
>> +    if (param.tablefd != -1) {
>> +        if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
>> +            error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
>> +                         param.tablefd, param.groupfd, strerror(errno));
>> +            return -errno;
>> +        }
>> +    }
>> +    trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
>> +#endif
>> +    return 0;
>> +}
>> +
>>  int vfio_spapr_remove_window(VFIOContainer *container,
>>                               hwaddr offset_within_address_space)
>>  {
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 2561c6d31a..084a92f7c2 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -123,3 +123,4 @@ vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
>>  vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>>  vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
>>  vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
>> +vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
>
diff mbox

Patch

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 6997ed7e98..8a1b32f89a 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -617,6 +617,7 @@  void spapr_tce_table_enable(sPAPRTCETable *tcet,
                             uint32_t page_shift, uint64_t bus_offset,
                             uint32_t nb_table);
 void spapr_tce_table_disable(sPAPRTCETable *tcet);
+int spapr_tce_get_fd(sPAPRTCETable *tcet);
 void spapr_tce_set_need_vfio(sPAPRTCETable *tcet, bool need_vfio);
 
 MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 7a4135ae6f..b99f4af96e 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -175,6 +175,8 @@  extern const MemoryListener vfio_prereg_listener;
 int vfio_spapr_create_window(VFIOContainer *container,
                              MemoryRegionSection *section,
                              hwaddr *pgsize);
+int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
+                          IOMMUMemoryRegion *iommumr);
 int vfio_spapr_remove_window(VFIOContainer *container,
                              hwaddr offset_within_address_space);
 
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 5051110b9d..f7531a6408 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -171,6 +171,11 @@  static void spapr_tce_notify_flag_changed(IOMMUMemoryRegion *iommu,
     }
 }
 
+int spapr_tce_get_fd(sPAPRTCETable *tcet)
+{
+    return tcet->fd;
+}
+
 static int spapr_tce_table_post_load(void *opaque, int version_id)
 {
     sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index e8188eb3d5..b94b29be15 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -440,6 +440,16 @@  static void vfio_listener_region_add(MemoryListener *listener,
             goto fail;
         }
 
+#ifdef CONFIG_KVM
+        if (kvm_enabled()) {
+            VFIOGroup *group;
+
+            QLIST_FOREACH(group, &container->group_list, container_next) {
+                vfio_spapr_notify_kvm(vfio_kvm_device_fd, group->fd,
+                                      IOMMU_MEMORY_REGION(section->mr));
+            }
+        }
+#endif
         vfio_host_win_add(container, section->offset_within_address_space,
                           section->offset_within_address_space +
                           int128_get64(section->size) - 1, pgsize);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 551870d46b..6410438e62 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -15,8 +15,12 @@ 
 
 #include "hw/vfio/vfio-common.h"
 #include "hw/hw.h"
+#include "hw/ppc/spapr.h"
 #include "qemu/error-report.h"
 #include "trace.h"
+#ifdef CONFIG_KVM
+#include "linux/kvm.h"
+#endif
 
 static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
 {
@@ -188,6 +192,33 @@  int vfio_spapr_create_window(VFIOContainer *container,
     return 0;
 }
 
+int vfio_spapr_notify_kvm(int vfio_kvm_device_fd, int groupfd,
+                          IOMMUMemoryRegion *iommumr)
+{
+#ifdef CONFIG_KVM
+    struct kvm_vfio_spapr_tce param = {
+        .groupfd = groupfd,
+    };
+    struct kvm_device_attr attr = {
+        .group = KVM_DEV_VFIO_GROUP,
+        .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
+        .addr = (uint64_t)(unsigned long)&param,
+    };
+    sPAPRTCETable *tcet = container_of(iommumr, sPAPRTCETable, iommu);
+
+    param.tablefd = spapr_tce_get_fd(tcet);
+    if (param.tablefd != -1) {
+        if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+            error_report("vfio: failed to setup fd %d for a group with fd %d: %s",
+                         param.tablefd, param.groupfd, strerror(errno));
+            return -errno;
+        }
+    }
+    trace_vfio_spapr_notify_kvm(groupfd, param.tablefd);
+#endif
+    return 0;
+}
+
 int vfio_spapr_remove_window(VFIOContainer *container,
                              hwaddr offset_within_address_space)
 {
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 2561c6d31a..084a92f7c2 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -123,3 +123,4 @@  vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"P
 vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
 vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
 vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
+vfio_spapr_notify_kvm(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"