[RFC] vfio: spapr: Move SPAPR-related code to a separate file

Message ID 1424679772-10328-1-git-send-email-aik@ozlabs.ru
State New

Commit Message

Alexey Kardashevskiy Feb. 23, 2015, 8:22 a.m. UTC
This moves SPAPR bits to a separate file to avoid pollution of x86 code.

This is a mechanical patch.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

There is another patch coming with DMA memory preregistration which will
add another listener on RAM (not IOMMU as in the previously posted patchset,
which was wrong); having these two in the same file with "type1" might be
confusing.

Does this make sense? There is some code duplication; does it make sense
to move it all into helpers?
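
For context, this is the shape vfio_connect_container() takes after the
patch: the type1 wiring stays inline in common.c, while the sPAPR case is
reduced to a single call into the new file. A condensed sketch, with group
attachment, error handling and the type1 bookkeeping elided (see the full
hunks quoted below):

    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;
        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);
    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
        ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
        ioctl(fd, VFIO_IOMMU_ENABLE);
        /* all sPAPR-specific listener wiring now lives in hw/vfio/spapr.c */
        spapr_memory_listener_register(container);
    } else {
        error_report("vfio: No available IOMMU models");
    }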


---
 hw/vfio/Makefile.objs         |   1 +
 hw/vfio/common.c              | 134 ++-----------------------
 hw/vfio/spapr.c               | 226 ++++++++++++++++++++++++++++++++++++++++++
 include/hw/vfio/vfio-common.h |  13 +++
 4 files changed, 246 insertions(+), 128 deletions(-)
 create mode 100644 hw/vfio/spapr.c

Comments

David Gibson Feb. 24, 2015, 4:57 a.m. UTC | #1
On Mon, Feb 23, 2015 at 07:22:52PM +1100, Alexey Kardashevskiy wrote:
> This moves SPAPR bits to a separate file to avoid pollution of x86 code.
> 
> This is a mechanical patch.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

This seems sensible to me.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
> 
> There is another patch coming with DMA memory preregistration which will
> add another listener on RAM (not IOMMU as in the previously posted patchset,
> which was wrong); having these two in the same file with "type1" might be
> confusing.
> 
> Does this make sense? There is some code duplication; does it make sense
> to move it all into helpers?
> 
> 
> ---
>  hw/vfio/Makefile.objs         |   1 +
>  hw/vfio/common.c              | 134 ++-----------------------
>  hw/vfio/spapr.c               | 226 ++++++++++++++++++++++++++++++++++++++++++
>  include/hw/vfio/vfio-common.h |  13 +++
>  4 files changed, 246 insertions(+), 128 deletions(-)
>  create mode 100644 hw/vfio/spapr.c
> 
> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> index e31f30e..92d32b6 100644
> --- a/hw/vfio/Makefile.objs
> +++ b/hw/vfio/Makefile.objs
> @@ -1,4 +1,5 @@
>  ifeq ($(CONFIG_LINUX), y)
>  obj-$(CONFIG_SOFTMMU) += common.o
>  obj-$(CONFIG_PCI) += pci.o
> +obj-$(CONFIG_PSERIES) += spapr.o
>  endif
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 15fbaed..4e48aed 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -190,8 +190,8 @@ const MemoryRegionOps vfio_region_ops = {
>  /*
>   * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>   */
> -static int vfio_dma_unmap(VFIOContainer *container,
> -                          hwaddr iova, ram_addr_t size)
> +int vfio_dma_unmap(VFIOContainer *container,
> +                   hwaddr iova, ram_addr_t size)
>  {
>      struct vfio_iommu_type1_dma_unmap unmap = {
>          .argsz = sizeof(unmap),
> @@ -208,8 +208,8 @@ static int vfio_dma_unmap(VFIOContainer *container,
>      return 0;
>  }
>  
> -static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> -                        ram_addr_t size, void *vaddr, bool readonly)
> +int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> +                 ram_addr_t size, void *vaddr, bool readonly)
>  {
>      struct vfio_iommu_type1_dma_map map = {
>          .argsz = sizeof(map),
> @@ -238,7 +238,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>      return -errno;
>  }
>  
> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> +bool vfio_listener_skipped_section(MemoryRegionSection *section)
>  {
>      return (!memory_region_is_ram(section->mr) &&
>              !memory_region_is_iommu(section->mr)) ||
> @@ -251,64 +251,6 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>             section->offset_within_address_space & (1ULL << 63);
>  }
>  
> -static void vfio_iommu_map_notify(Notifier *n, void *data)
> -{
> -    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
> -    VFIOContainer *container = giommu->container;
> -    IOMMUTLBEntry *iotlb = data;
> -    MemoryRegion *mr;
> -    hwaddr xlat;
> -    hwaddr len = iotlb->addr_mask + 1;
> -    void *vaddr;
> -    int ret;
> -
> -    trace_vfio_iommu_map_notify(iotlb->iova,
> -                                iotlb->iova + iotlb->addr_mask);
> -
> -    /*
> -     * The IOMMU TLB entry we have just covers translation through
> -     * this IOMMU to its immediate target.  We need to translate
> -     * it the rest of the way through to memory.
> -     */
> -    mr = address_space_translate(&address_space_memory,
> -                                 iotlb->translated_addr,
> -                                 &xlat, &len, iotlb->perm & IOMMU_WO);
> -    if (!memory_region_is_ram(mr)) {
> -        error_report("iommu map to non memory area %"HWADDR_PRIx"\n",
> -                     xlat);
> -        return;
> -    }
> -    /*
> -     * Translation truncates length to the IOMMU page size,
> -     * check that it did not truncate too much.
> -     */
> -    if (len & iotlb->addr_mask) {
> -        error_report("iommu has granularity incompatible with target AS\n");
> -        return;
> -    }
> -
> -    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> -        vaddr = memory_region_get_ram_ptr(mr) + xlat;
> -        ret = vfio_dma_map(container, iotlb->iova,
> -                           iotlb->addr_mask + 1, vaddr,
> -                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
> -        if (ret) {
> -            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> -                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
> -                         container, iotlb->iova,
> -                         iotlb->addr_mask + 1, vaddr, ret);
> -        }
> -    } else {
> -        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
> -        if (ret) {
> -            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> -                         "0x%"HWADDR_PRIx") = %d (%m)",
> -                         container, iotlb->iova,
> -                         iotlb->addr_mask + 1, ret);
> -        }
> -    }
> -}
> -
>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -344,45 +286,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
>  
>      memory_region_ref(section->mr);
>  
> -    if (memory_region_is_iommu(section->mr)) {
> -        VFIOGuestIOMMU *giommu;
> -
> -        trace_vfio_listener_region_add_iommu(iova,
> -                    int128_get64(int128_sub(llend, int128_one())));
> -        /*
> -         * FIXME: We should do some checking to see if the
> -         * capabilities of the host VFIO IOMMU are adequate to model
> -         * the guest IOMMU
> -         *
> -         * FIXME: For VFIO iommu types which have KVM acceleration to
> -         * avoid bouncing all map/unmaps through qemu this way, this
> -         * would be the right place to wire that up (tell the KVM
> -         * device emulation the VFIO iommu handles to use).
> -         */
> -        /*
> -         * This assumes that the guest IOMMU is empty of
> -         * mappings at this point.
> -         *
> -         * One way of doing this is:
> -         * 1. Avoid sharing IOMMUs between emulated devices or different
> -         * IOMMU groups.
> -         * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
> -         * there are some mappings in IOMMU.
> -         *
> -         * VFIO on SPAPR does that. Other IOMMU models may do that different,
> -         * they must make sure there are no existing mappings or
> -         * loop through existing mappings to map them into VFIO.
> -         */
> -        giommu = g_malloc0(sizeof(*giommu));
> -        giommu->iommu = section->mr;
> -        giommu->container = container;
> -        giommu->n.notify = vfio_iommu_map_notify;
> -        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> -        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> -
> -        return;
> -    }
> -
>      /* Here we assume that memory_region_is_ram(section->mr)==true */
>  
>      end = int128_get64(llend);
> @@ -435,27 +338,6 @@ static void vfio_listener_region_del(MemoryListener *listener,
>          return;
>      }
>  
> -    if (memory_region_is_iommu(section->mr)) {
> -        VFIOGuestIOMMU *giommu;
> -
> -        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
> -            if (giommu->iommu == section->mr) {
> -                memory_region_unregister_iommu_notifier(&giommu->n);
> -                QLIST_REMOVE(giommu, giommu_next);
> -                g_free(giommu);
> -                break;
> -            }
> -        }
> -
> -        /*
> -         * FIXME: We assume the one big unmap below is adequate to
> -         * remove any individual page mappings in the IOMMU which
> -         * might have been copied into VFIO. This works for a page table
> -         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
> -         * That may not be true for all IOMMU types.
> -         */
> -    }
> -
>      iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>      end = (section->offset_within_address_space + int128_get64(section->size)) &
>            TARGET_PAGE_MASK;
> @@ -721,11 +603,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
>              goto free_container_exit;
>          }
>  
> -        container->iommu_data.type1.listener = vfio_memory_listener;
> -        container->iommu_data.release = vfio_listener_release;
> -
> -        memory_listener_register(&container->iommu_data.type1.listener,
> -                                 container->space->as);
> +        spapr_memory_listener_register(container);
>  
>      } else {
>          error_report("vfio: No available IOMMU models");
> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
> new file mode 100644
> index 0000000..f743138
> --- /dev/null
> +++ b/hw/vfio/spapr.c
> @@ -0,0 +1,226 @@
> +/*
> + * QEMU sPAPR VFIO IOMMU
> + *
> + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License,
> + *  or (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License
> + *  along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "hw/vfio/vfio-common.h"
> +#include "qemu/error-report.h"
> +#include "trace.h"
> +
> +static void vfio_iommu_map_notify(Notifier *n, void *data)
> +{
> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
> +    VFIOContainer *container = giommu->container;
> +    IOMMUTLBEntry *iotlb = data;
> +    MemoryRegion *mr;
> +    hwaddr xlat;
> +    hwaddr len = iotlb->addr_mask + 1;
> +    void *vaddr;
> +    int ret;
> +
> +    trace_vfio_iommu_map_notify(iotlb->iova,
> +                                iotlb->iova + iotlb->addr_mask);
> +
> +    /*
> +     * The IOMMU TLB entry we have just covers translation through
> +     * this IOMMU to its immediate target.  We need to translate
> +     * it the rest of the way through to memory.
> +     */
> +    mr = address_space_translate(&address_space_memory,
> +                                 iotlb->translated_addr,
> +                                 &xlat, &len, iotlb->perm & IOMMU_WO);
> +    if (!memory_region_is_ram(mr)) {
> +        error_report("iommu map to non memory area %"HWADDR_PRIx"\n",
> +                     xlat);
> +        return;
> +    }
> +    /*
> +     * Translation truncates length to the IOMMU page size,
> +     * check that it did not truncate too much.
> +     */
> +    if (len & iotlb->addr_mask) {
> +        error_report("iommu has granularity incompatible with target AS\n");
> +        return;
> +    }
> +
> +    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> +        vaddr = memory_region_get_ram_ptr(mr) + xlat;
> +        ret = vfio_dma_map(container, iotlb->iova,
> +                           iotlb->addr_mask + 1, vaddr,
> +                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
> +        if (ret) {
> +            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
> +                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
> +                         container, iotlb->iova,
> +                         iotlb->addr_mask + 1, vaddr, ret);
> +        }
> +    } else {
> +        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
> +        if (ret) {
> +            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> +                         "0x%"HWADDR_PRIx") = %d (%m)",
> +                         container, iotlb->iova,
> +                         iotlb->addr_mask + 1, ret);
> +        }
> +    }
> +}
> +
> +static void vfio_spapr_listener_region_add(MemoryListener *listener,
> +                                     MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.spapr.listener);
> +    hwaddr iova;
> +    Int128 llend;
> +    VFIOGuestIOMMU *giommu;
> +
> +    if (vfio_listener_skipped_section(section)) {
> +        trace_vfio_listener_region_add_skip(
> +            section->offset_within_address_space,
> +            section->offset_within_address_space +
> +            int128_get64(int128_sub(section->size, int128_one())));
> +        return;
> +    }
> +
> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> +        error_report("%s received unaligned region", __func__);
> +        return;
> +    }
> +
> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +    llend = int128_make64(section->offset_within_address_space);
> +    llend = int128_add(llend, section->size);
> +    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> +
> +    if (int128_ge(int128_make64(iova), llend)) {
> +        return;
> +    }
> +
> +    memory_region_ref(section->mr);
> +
> +    trace_vfio_listener_region_add_iommu(iova,
> +         int128_get64(int128_sub(llend, int128_one())));
> +    /*
> +     * FIXME: We should do some checking to see if the
> +     * capabilities of the host VFIO IOMMU are adequate to model
> +     * the guest IOMMU
> +     *
> +     * FIXME: For VFIO iommu types which have KVM acceleration to
> +     * avoid bouncing all map/unmaps through qemu this way, this
> +     * would be the right place to wire that up (tell the KVM
> +     * device emulation the VFIO iommu handles to use).
> +     */
> +    /*
> +     * This assumes that the guest IOMMU is empty of
> +     * mappings at this point.
> +     *
> +     * One way of doing this is:
> +     * 1. Avoid sharing IOMMUs between emulated devices or different
> +     * IOMMU groups.
> +     * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
> +     * there are some mappings in IOMMU.
> +     *
> +     * VFIO on SPAPR does that. Other IOMMU models may do that different,
> +     * they must make sure there are no existing mappings or
> +     * loop through existing mappings to map them into VFIO.
> +     */
> +    giommu = g_malloc0(sizeof(*giommu));
> +    giommu->iommu = section->mr;
> +    giommu->container = container;
> +    giommu->n.notify = vfio_iommu_map_notify;
> +    QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> +    memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> +}
> +
> +static void vfio_spapr_listener_region_del(MemoryListener *listener,
> +                                     MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.spapr.listener);
> +    hwaddr iova, end;
> +    int ret;
> +    VFIOGuestIOMMU *giommu;
> +
> +    if (vfio_listener_skipped_section(section)) {
> +        trace_vfio_listener_region_del_skip(
> +            section->offset_within_address_space,
> +            section->offset_within_address_space +
> +            int128_get64(int128_sub(section->size, int128_one())));
> +        return;
> +    }
> +
> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> +        error_report("%s received unaligned region", __func__);
> +        return;
> +    }
> +
> +    QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
> +        if (giommu->iommu == section->mr) {
> +            memory_region_unregister_iommu_notifier(&giommu->n);
> +            QLIST_REMOVE(giommu, giommu_next);
> +            g_free(giommu);
> +            break;
> +        }
> +    }
> +
> +    /*
> +     * FIXME: We assume the one big unmap below is adequate to
> +     * remove any individual page mappings in the IOMMU which
> +     * might have been copied into VFIO. This works for a page table
> +     * based IOMMU where a big unmap flattens a large range of IO-PTEs.
> +     * That may not be true for all IOMMU types.
> +     */
> +
> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +    end = (section->offset_within_address_space + int128_get64(section->size)) &
> +        TARGET_PAGE_MASK;
> +
> +    if (iova >= end) {
> +        return;
> +    }
> +
> +    trace_vfio_listener_region_del(iova, end - 1);
> +
> +    ret = vfio_dma_unmap(container, iova, end - iova);
> +    memory_region_unref(section->mr);
> +    if (ret) {
> +        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> +                     "0x%"HWADDR_PRIx") = %d (%m)",
> +                     container, iova, end - iova, ret);
> +    }
> +}
> +
> +static const MemoryListener vfio_spapr_memory_listener = {
> +    .region_add = vfio_spapr_listener_region_add,
> +    .region_del = vfio_spapr_listener_region_del,
> +};
> +
> +static void vfio_spapr_listener_release(VFIOContainer *container)
> +{
> +    memory_listener_unregister(&container->iommu_data.spapr.listener);
> +}
> +
> +void spapr_memory_listener_register(VFIOContainer *container)
> +{
> +    container->iommu_data.spapr.listener = vfio_spapr_memory_listener;
> +    container->iommu_data.release = vfio_spapr_listener_release;
> +
> +    memory_listener_register(&container->iommu_data.spapr.listener,
> +                             container->space->as);
> +}
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index da85aa9..88d9e1f 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -70,6 +70,10 @@ typedef struct VFIOType1 {
>      bool initialized;
>  } VFIOType1;
>  
> +typedef struct VFIOSPAPR {
> +    MemoryListener listener;
> +} VFIOSPAPR;
> +
>  typedef struct VFIOContainer {
>      VFIOAddressSpace *space;
>      int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> @@ -77,6 +81,7 @@ typedef struct VFIOContainer {
>          /* enable abstraction to support various iommu backends */
>          union {
>              VFIOType1 type1;
> +            VFIOSPAPR spapr;
>          };
>          void (*release)(struct VFIOContainer *);
>      } iommu_data;
> @@ -145,4 +150,12 @@ extern const MemoryRegionOps vfio_region_ops;
>  extern QLIST_HEAD(vfio_group_head, VFIOGroup) vfio_group_list;
>  extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces;
>  
> +extern int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> +                        ram_addr_t size, void *vaddr, bool readonly);
> +extern int vfio_dma_unmap(VFIOContainer *container,
> +                          hwaddr iova, ram_addr_t size);
> +bool vfio_listener_skipped_section(MemoryRegionSection *section);
> +
> +extern void spapr_memory_listener_register(VFIOContainer *container);
> +
>  #endif /* !HW_VFIO_VFIO_COMMON_H */
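
A note on the iommu_data.release hook the new spapr.c assigns: it is the
per-container teardown callback, invoked once the last group detaches from
the container. Paraphrased from the existing vfio_disconnect_container()
in hw/vfio/common.c:

    if (QLIST_EMPTY(&container->group_list)) {
        if (container->iommu_data.release) {
            /* for sPAPR containers this is now vfio_spapr_listener_release() */
            container->iommu_data.release(container);
        }
        QLIST_REMOVE(container, next);
        close(container->fd);
        g_free(container);
    }
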
Alex Williamson March 3, 2015, 8:24 p.m. UTC | #2
On Mon, 2015-02-23 at 19:22 +1100, Alexey Kardashevskiy wrote:
> This moves SPAPR bits to a separate file to avoid pollution of x86 code.
> 
> This is a mechanical patch.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> 
> There is another patch coming with DMA memory preregistration which will
> add another listener on RAM (not IOMMU as in the previously posted patchset,
> which was wrong); having these two in the same file with "type1" might be
> confusing.
> 
> Does this make sense? There is some code duplication; does it make sense
> to move it all into helpers?

I guess I don't have a significant objection to this.  It does sort of
beg the question whether type1 should also have its memory listener
functions pulled out to a separate file as well, in which case yes, we
may want to create helpers to avoid the code duplication.  I'm also a
little disappointed that this seems like it might be removing guest
IOMMU support from common paths, because we'll probably need to deal
with that eventually for x86/type1, but I expect it's currently broken
for anything other than spapr anyway.  Thanks,

Alex

> [full patch snipped]
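
To make the helper idea concrete: both region_add listeners open with the
same skip/alignment/bounds boilerplate, which could move into common.c. A
minimal sketch, with a hypothetical name and the per-caller trace points
left out (this is not part of the patch):

    static bool vfio_listener_section_bounds(MemoryRegionSection *section,
                                             hwaddr *iova, Int128 *llend)
    {
        if (vfio_listener_skipped_section(section)) {
            return false;
        }

        if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                     (section->offset_within_region & ~TARGET_PAGE_MASK))) {
            error_report("%s received unaligned region", __func__);
            return false;
        }

        *iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
        *llend = int128_make64(section->offset_within_address_space);
        *llend = int128_add(*llend, section->size);
        *llend = int128_and(*llend, int128_exts64(TARGET_PAGE_MASK));

        /* an empty section after alignment is not an error, just a no-op */
        return !int128_ge(int128_make64(*iova), *llend);
    }
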
Alexey Kardashevskiy March 4, 2015, 12:18 a.m. UTC | #3
On 03/04/2015 07:24 AM, Alex Williamson wrote:
> On Mon, 2015-02-23 at 19:22 +1100, Alexey Kardashevskiy wrote:
>> This moves SPAPR bits to a separate file to avoid pollution of x86 code.
>>
>> This is a mechanical patch.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>
>> There is another patch coming with DMA memory preregistration which will
>> add another listener on RAM (not IOMMU as in the previously posted patchset,
>> which was wrong); having these two in the same file with "type1" might be
>> confusing.
>>
>> Does this make sense? There is some code duplication; does it make sense
>> to move it all into helpers?
>
> I guess I don't have a significant objection to this.  It does sort of
> beg the question whether type1 should also have its memory listener
> functions pulled out to a separate file as well, in which case yes, we
> may want to create helpers to avoid the code duplication.  I'm also a
> little disappointed that this seems like it might be removing guest
> IOMMU support from common paths, because we'll probably need to deal
> with that eventually for x86/type1, but I expect it's currently broken
> for anything other than spapr anyway.  Thanks,

Well, I could move the type1 listener to a new file too, rename spapr.c to
iommu.c to make it less sPAPR-ish, and add the RAM listener for memory
preregistration in a separate file (prereg.c or ramreg.c or ???; for now I
keep it in spapr.c). Or is that too much, and this patch is enough for today?


>
> Alex
>
>> ---
>>   hw/vfio/Makefile.objs         |   1 +
>>   hw/vfio/common.c              | 134 ++-----------------------
>>   hw/vfio/spapr.c               | 226 ++++++++++++++++++++++++++++++++++++++++++
>>   include/hw/vfio/vfio-common.h |  13 +++
>>   4 files changed, 246 insertions(+), 128 deletions(-)
>>   create mode 100644 hw/vfio/spapr.c
>>
>> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
>> index e31f30e..92d32b6 100644
>> --- a/hw/vfio/Makefile.objs
>> +++ b/hw/vfio/Makefile.objs
>> @@ -1,4 +1,5 @@
>>   ifeq ($(CONFIG_LINUX), y)
>>   obj-$(CONFIG_SOFTMMU) += common.o
>>   obj-$(CONFIG_PCI) += pci.o
>> +obj-$(CONFIG_PSERIES) += spapr.o
>>   endif
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 15fbaed..4e48aed 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -190,8 +190,8 @@ const MemoryRegionOps vfio_region_ops = {
>>   /*
>>    * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>>    */
>> -static int vfio_dma_unmap(VFIOContainer *container,
>> -                          hwaddr iova, ram_addr_t size)
>> +int vfio_dma_unmap(VFIOContainer *container,
>> +                   hwaddr iova, ram_addr_t size)
>>   {
>>       struct vfio_iommu_type1_dma_unmap unmap = {
>>           .argsz = sizeof(unmap),
>> @@ -208,8 +208,8 @@ static int vfio_dma_unmap(VFIOContainer *container,
>>       return 0;
>>   }
>>
>> -static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>> -                        ram_addr_t size, void *vaddr, bool readonly)
>> +int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>> +                 ram_addr_t size, void *vaddr, bool readonly)
>>   {
>>       struct vfio_iommu_type1_dma_map map = {
>>           .argsz = sizeof(map),
>> @@ -238,7 +238,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>       return -errno;
>>   }
>>
>> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>> +bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>   {
>>       return (!memory_region_is_ram(section->mr) &&
>>               !memory_region_is_iommu(section->mr)) ||
>> @@ -251,64 +251,6 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>              section->offset_within_address_space & (1ULL << 63);
>>   }
>>
>> -static void vfio_iommu_map_notify(Notifier *n, void *data)
>> -{
>> -    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>> -    VFIOContainer *container = giommu->container;
>> -    IOMMUTLBEntry *iotlb = data;
>> -    MemoryRegion *mr;
>> -    hwaddr xlat;
>> -    hwaddr len = iotlb->addr_mask + 1;
>> -    void *vaddr;
>> -    int ret;
>> -
>> -    trace_vfio_iommu_map_notify(iotlb->iova,
>> -                                iotlb->iova + iotlb->addr_mask);
>> -
>> -    /*
>> -     * The IOMMU TLB entry we have just covers translation through
>> -     * this IOMMU to its immediate target.  We need to translate
>> -     * it the rest of the way through to memory.
>> -     */
>> -    mr = address_space_translate(&address_space_memory,
>> -                                 iotlb->translated_addr,
>> -                                 &xlat, &len, iotlb->perm & IOMMU_WO);
>> -    if (!memory_region_is_ram(mr)) {
>> -        error_report("iommu map to non memory area %"HWADDR_PRIx"\n",
>> -                     xlat);
>> -        return;
>> -    }
>> -    /*
>> -     * Translation truncates length to the IOMMU page size,
>> -     * check that it did not truncate too much.
>> -     */
>> -    if (len & iotlb->addr_mask) {
>> -        error_report("iommu has granularity incompatible with target AS\n");
>> -        return;
>> -    }
>> -
>> -    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
>> -        vaddr = memory_region_get_ram_ptr(mr) + xlat;
>> -        ret = vfio_dma_map(container, iotlb->iova,
>> -                           iotlb->addr_mask + 1, vaddr,
>> -                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
>> -        if (ret) {
>> -            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
>> -                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
>> -                         container, iotlb->iova,
>> -                         iotlb->addr_mask + 1, vaddr, ret);
>> -        }
>> -    } else {
>> -        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
>> -        if (ret) {
>> -            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>> -                         "0x%"HWADDR_PRIx") = %d (%m)",
>> -                         container, iotlb->iova,
>> -                         iotlb->addr_mask + 1, ret);
>> -        }
>> -    }
>> -}
>> -
>>   static void vfio_listener_region_add(MemoryListener *listener,
>>                                        MemoryRegionSection *section)
>>   {
>> @@ -344,45 +286,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>
>>       memory_region_ref(section->mr);
>>
>> -    if (memory_region_is_iommu(section->mr)) {
>> -        VFIOGuestIOMMU *giommu;
>> -
>> -        trace_vfio_listener_region_add_iommu(iova,
>> -                    int128_get64(int128_sub(llend, int128_one())));
>> -        /*
>> -         * FIXME: We should do some checking to see if the
>> -         * capabilities of the host VFIO IOMMU are adequate to model
>> -         * the guest IOMMU
>> -         *
>> -         * FIXME: For VFIO iommu types which have KVM acceleration to
>> -         * avoid bouncing all map/unmaps through qemu this way, this
>> -         * would be the right place to wire that up (tell the KVM
>> -         * device emulation the VFIO iommu handles to use).
>> -         */
>> -        /*
>> -         * This assumes that the guest IOMMU is empty of
>> -         * mappings at this point.
>> -         *
>> -         * One way of doing this is:
>> -         * 1. Avoid sharing IOMMUs between emulated devices or different
>> -         * IOMMU groups.
>> -         * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
>> -         * there are some mappings in IOMMU.
>> -         *
>> -         * VFIO on SPAPR does that. Other IOMMU models may do that different,
>> -         * they must make sure there are no existing mappings or
>> -         * loop through existing mappings to map them into VFIO.
>> -         */
>> -        giommu = g_malloc0(sizeof(*giommu));
>> -        giommu->iommu = section->mr;
>> -        giommu->container = container;
>> -        giommu->n.notify = vfio_iommu_map_notify;
>> -        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>> -        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
>> -
>> -        return;
>> -    }
>> -
>>       /* Here we assume that memory_region_is_ram(section->mr)==true */
>>
>>       end = int128_get64(llend);
>> @@ -435,27 +338,6 @@ static void vfio_listener_region_del(MemoryListener *listener,
>>           return;
>>       }
>>
>> -    if (memory_region_is_iommu(section->mr)) {
>> -        VFIOGuestIOMMU *giommu;
>> -
>> -        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
>> -            if (giommu->iommu == section->mr) {
>> -                memory_region_unregister_iommu_notifier(&giommu->n);
>> -                QLIST_REMOVE(giommu, giommu_next);
>> -                g_free(giommu);
>> -                break;
>> -            }
>> -        }
>> -
>> -        /*
>> -         * FIXME: We assume the one big unmap below is adequate to
>> -         * remove any individual page mappings in the IOMMU which
>> -         * might have been copied into VFIO. This works for a page table
>> -         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
>> -         * That may not be true for all IOMMU types.
>> -         */
>> -    }
>> -
>>       iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>>       end = (section->offset_within_address_space + int128_get64(section->size)) &
>>             TARGET_PAGE_MASK;
>> @@ -721,11 +603,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
>>               goto free_container_exit;
>>           }
>>
>> -        container->iommu_data.type1.listener = vfio_memory_listener;
>> -        container->iommu_data.release = vfio_listener_release;
>> -
>> -        memory_listener_register(&container->iommu_data.type1.listener,
>> -                                 container->space->as);
>> +        spapr_memory_listener_register(container);
>>
>>       } else {
>>           error_report("vfio: No available IOMMU models");
>> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
>> new file mode 100644
>> index 0000000..f743138
>> --- /dev/null
>> +++ b/hw/vfio/spapr.c
>> @@ -0,0 +1,226 @@
>> +/*
>> + * QEMU sPAPR VFIO IOMMU
>> + *
>> + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
>> + *
>> + *  This program is free software; you can redistribute it and/or modify
>> + *  it under the terms of the GNU General Public License as published by
>> + *  the Free Software Foundation; either version 2 of the License,
>> + *  or (at your option) any later version.
>> + *
>> + *  This program is distributed in the hope that it will be useful,
>> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + *  GNU General Public License for more details.
>> + *
>> + *  You should have received a copy of the GNU General Public License
>> + *  along with this program; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include "hw/vfio/vfio-common.h"
>> +#include "qemu/error-report.h"
>> +#include "trace.h"
>> +
>> +static void vfio_iommu_map_notify(Notifier *n, void *data)
>> +{
>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>> +    VFIOContainer *container = giommu->container;
>> +    IOMMUTLBEntry *iotlb = data;
>> +    MemoryRegion *mr;
>> +    hwaddr xlat;
>> +    hwaddr len = iotlb->addr_mask + 1;
>> +    void *vaddr;
>> +    int ret;
>> +
>> +    trace_vfio_iommu_map_notify(iotlb->iova,
>> +                                iotlb->iova + iotlb->addr_mask);
>> +
>> +    /*
>> +     * The IOMMU TLB entry we have just covers translation through
>> +     * this IOMMU to its immediate target.  We need to translate
>> +     * it the rest of the way through to memory.
>> +     */
>> +    mr = address_space_translate(&address_space_memory,
>> +                                 iotlb->translated_addr,
>> +                                 &xlat, &len, iotlb->perm & IOMMU_WO);
>> +    if (!memory_region_is_ram(mr)) {
>> +        error_report("iommu map to non memory area %"HWADDR_PRIx"\n",
>> +                     xlat);
>> +        return;
>> +    }
>> +    /*
>> +     * Translation truncates length to the IOMMU page size,
>> +     * check that it did not truncate too much.
>> +     */
>> +    if (len & iotlb->addr_mask) {
>> +        error_report("iommu has granularity incompatible with target AS\n");
>> +        return;
>> +    }
>> +
>> +    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
>> +        vaddr = memory_region_get_ram_ptr(mr) + xlat;
>> +        ret = vfio_dma_map(container, iotlb->iova,
>> +                           iotlb->addr_mask + 1, vaddr,
>> +                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
>> +        if (ret) {
>> +            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
>> +                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
>> +                         container, iotlb->iova,
>> +                         iotlb->addr_mask + 1, vaddr, ret);
>> +        }
>> +    } else {
>> +        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
>> +        if (ret) {
>> +            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>> +                         "0x%"HWADDR_PRIx") = %d (%m)",
>> +                         container, iotlb->iova,
>> +                         iotlb->addr_mask + 1, ret);
>> +        }
>> +    }
>> +}
>> +
>> +static void vfio_spapr_listener_region_add(MemoryListener *listener,
>> +                                     MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer,
>> +                                            iommu_data.spapr.listener);
>> +    hwaddr iova;
>> +    Int128 llend;
>> +    VFIOGuestIOMMU *giommu;
>> +
>> +    if (vfio_listener_skipped_section(section)) {
>> +        trace_vfio_listener_region_add_skip(
>> +            section->offset_within_address_space,
>> +            section->offset_within_address_space +
>> +            int128_get64(int128_sub(section->size, int128_one())));
>> +        return;
>> +    }
>> +
>> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
>> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> +        error_report("%s received unaligned region", __func__);
>> +        return;
>> +    }
>> +
>> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> +    llend = int128_make64(section->offset_within_address_space);
>> +    llend = int128_add(llend, section->size);
>> +    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>> +
>> +    if (int128_ge(int128_make64(iova), llend)) {
>> +        return;
>> +    }
>> +
>> +    memory_region_ref(section->mr);
>> +
>> +    trace_vfio_listener_region_add_iommu(iova,
>> +         int128_get64(int128_sub(llend, int128_one())));
>> +    /*
>> +     * FIXME: We should do some checking to see if the
>> +     * capabilities of the host VFIO IOMMU are adequate to model
>> +     * the guest IOMMU
>> +     *
>> +     * FIXME: For VFIO iommu types which have KVM acceleration to
>> +     * avoid bouncing all map/unmaps through qemu this way, this
>> +     * would be the right place to wire that up (tell the KVM
>> +     * device emulation the VFIO iommu handles to use).
>> +     */
>> +    /*
>> +     * This assumes that the guest IOMMU is empty of
>> +     * mappings at this point.
>> +     *
>> +     * One way of doing this is:
>> +     * 1. Avoid sharing IOMMUs between emulated devices or different
>> +     * IOMMU groups.
>> +     * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
>> +     * there are some mappings in IOMMU.
>> +     *
>> +     * VFIO on SPAPR does that. Other IOMMU models may do that different,
>> +     * they must make sure there are no existing mappings or
>> +     * loop through existing mappings to map them into VFIO.
>> +     */
>> +    giommu = g_malloc0(sizeof(*giommu));
>> +    giommu->iommu = section->mr;
>> +    giommu->container = container;
>> +    giommu->n.notify = vfio_iommu_map_notify;
>> +    QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>> +    memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
>> +}
>> +
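
For reference, the VFIOGuestIOMMU tracker allocated above is the existing
type from include/hw/vfio/vfio-common.h, roughly:

    typedef struct VFIOGuestIOMMU {
        VFIOContainer *container;   /* container to replay updates into */
        MemoryRegion *iommu;        /* guest IOMMU region being tracked */
        Notifier n;                 /* fires vfio_iommu_map_notify() */
        QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
    } VFIOGuestIOMMU;

One tracker per guest IOMMU region is what lets region_del below find and
unregister the matching notifier.
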
>> +static void vfio_spapr_listener_region_del(MemoryListener *listener,
>> +                                           MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer,
>> +                                            iommu_data.spapr.listener);
>> +    hwaddr iova, end;
>> +    int ret;
>> +    VFIOGuestIOMMU *giommu;
>> +
>> +    if (vfio_listener_skipped_section(section)) {
>> +        trace_vfio_listener_region_del_skip(
>> +            section->offset_within_address_space,
>> +            section->offset_within_address_space +
>> +            int128_get64(int128_sub(section->size, int128_one())));
>> +        return;
>> +    }
>> +
>> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
>> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> +        error_report("%s received unaligned region", __func__);
>> +        return;
>> +    }
>> +
>> +    QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
>> +        if (giommu->iommu == section->mr) {
>> +            memory_region_unregister_iommu_notifier(&giommu->n);
>> +            QLIST_REMOVE(giommu, giommu_next);
>> +            g_free(giommu);
>> +            break;
>> +        }
>> +    }
>> +
>> +    /*
>> +     * FIXME: We assume the one big unmap below is adequate to
>> +     * remove any individual page mappings in the IOMMU which
>> +     * might have been copied into VFIO. This works for a page table
>> +     * based IOMMU where a big unmap flattens a large range of IO-PTEs.
>> +     * That may not be true for all IOMMU types.
>> +     */
>> +
>> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> +    end = (section->offset_within_address_space + int128_get64(section->size)) &
>> +        TARGET_PAGE_MASK;
>> +
>> +    if (iova >= end) {
>> +        return;
>> +    }
>> +
>> +    trace_vfio_listener_region_del(iova, end - 1);
>> +
>> +    ret = vfio_dma_unmap(container, iova, end - iova);
>> +    memory_region_unref(section->mr);
>> +    if (ret) {
>> +        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>> +                     "0x%"HWADDR_PRIx") = %d (%m)",
>> +                     container, iova, end - iova, ret);
>> +    }
>> +}
>> +
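
A quick worked example of the page rounding above, with hypothetical
numbers and 4 KiB target pages:

    /* Assuming TARGET_PAGE_MASK == ~0xfffULL: */
    hwaddr ows  = 0x10000800;   /* section->offset_within_address_space */
    hwaddr size = 0x2000;       /* int128_get64(section->size) */
    hwaddr iova = (ows + 0xfff) & ~0xfffULL; /* TARGET_PAGE_ALIGN: 0x10001000 */
    hwaddr end  = (ows + size) & ~0xfffULL;  /* masked end:        0x10002000 */

Only the fully covered page [0x10001000, 0x10002000) is unmapped;
partially covered pages at either edge are skipped, and a section smaller
than one page bails out early via the iova >= end check.
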
>> +static const MemoryListener vfio_spapr_memory_listener = {
>> +    .region_add = vfio_spapr_listener_region_add,
>> +    .region_del = vfio_spapr_listener_region_del,
>> +};
>> +
>> +static void vfio_spapr_listener_release(VFIOContainer *container)
>> +{
>> +    memory_listener_unregister(&container->iommu_data.spapr.listener);
>> +}
>> +
>> +void spapr_memory_listener_register(VFIOContainer *container)
>> +{
>> +    container->iommu_data.spapr.listener = vfio_spapr_memory_listener;
>> +    container->iommu_data.release = vfio_spapr_listener_release;
>> +
>> +    memory_listener_register(&container->iommu_data.spapr.listener,
>> +                             container->space->as);
>> +}
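
On the registration side, the common code picks a backend once per
container. A sketch of the dispatch, condensed from the
vfio_connect_container() hunk in the patch below (error handling and the
VFIO_SET_IOMMU steps are elided):

    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        /* x86-style: the listener maps all of guest RAM up front */
        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;
        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);
    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
        /* sPAPR: mappings are driven by guest TCE updates instead */
        spapr_memory_listener_register(container);
    }
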
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index da85aa9..88d9e1f 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -70,6 +70,10 @@ typedef struct VFIOType1 {
>>       bool initialized;
>>   } VFIOType1;
>>
>> +typedef struct VFIOSPAPR {
>> +    MemoryListener listener;
>> +} VFIOSPAPR;
>> +
>>   typedef struct VFIOContainer {
>>       VFIOAddressSpace *space;
>>       int fd; /* /dev/vfio/vfio, empowered by the attached groups */
>> @@ -77,6 +81,7 @@ typedef struct VFIOContainer {
>>           /* enable abstraction to support various iommu backends */
>>           union {
>>               VFIOType1 type1;
>> +            VFIOSPAPR spapr;
>>           };
>>           void (*release)(struct VFIOContainer *);
>>       } iommu_data;
>> @@ -145,4 +150,12 @@ extern const MemoryRegionOps vfio_region_ops;
>>   extern QLIST_HEAD(vfio_group_head, VFIOGroup) vfio_group_list;
>>   extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces;
>>
>> +extern int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>> +                        ram_addr_t size, void *vaddr, bool readonly);
>> +extern int vfio_dma_unmap(VFIOContainer *container,
>> +                          hwaddr iova, ram_addr_t size);
>> +extern bool vfio_listener_skipped_section(MemoryRegionSection *section);
>> +
>> +extern void spapr_memory_listener_register(VFIOContainer *container);
>> +
>>   #endif /* !HW_VFIO_VFIO_COMMON_H */
>
>
>
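
One property worth calling out before the patch itself: because the
release hook lives next to the backend union, generic container teardown
never needs to know which IOMMU type is active. A simplified sketch of
that path, condensed from vfio_disconnect_container() in hw/vfio/common.c:

    /* Whichever backend registered its listener also installed the
     * matching release hook, so common code stays type-agnostic. */
    if (container->iommu_data.release) {
        container->iommu_data.release(container); /* unregisters listener */
    }
    close(container->fd);
    g_free(container);
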

Patch

diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index e31f30e..92d32b6 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -1,4 +1,5 @@ 
 ifeq ($(CONFIG_LINUX), y)
 obj-$(CONFIG_SOFTMMU) += common.o
 obj-$(CONFIG_PCI) += pci.o
+obj-$(CONFIG_PSERIES) += spapr.o
 endif
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 15fbaed..4e48aed 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -190,8 +190,8 @@ const MemoryRegionOps vfio_region_ops = {
 /*
  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  */
-static int vfio_dma_unmap(VFIOContainer *container,
-                          hwaddr iova, ram_addr_t size)
+int vfio_dma_unmap(VFIOContainer *container,
+                   hwaddr iova, ram_addr_t size)
 {
     struct vfio_iommu_type1_dma_unmap unmap = {
         .argsz = sizeof(unmap),
@@ -208,8 +208,8 @@ static int vfio_dma_unmap(VFIOContainer *container,
     return 0;
 }
 
-static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
-                        ram_addr_t size, void *vaddr, bool readonly)
+int vfio_dma_map(VFIOContainer *container, hwaddr iova,
+                 ram_addr_t size, void *vaddr, bool readonly)
 {
     struct vfio_iommu_type1_dma_map map = {
         .argsz = sizeof(map),
@@ -238,7 +238,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
     return -errno;
 }
 
-static bool vfio_listener_skipped_section(MemoryRegionSection *section)
+bool vfio_listener_skipped_section(MemoryRegionSection *section)
 {
     return (!memory_region_is_ram(section->mr) &&
             !memory_region_is_iommu(section->mr)) ||
@@ -251,64 +251,6 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
            section->offset_within_address_space & (1ULL << 63);
 }
 
-static void vfio_iommu_map_notify(Notifier *n, void *data)
-{
-    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
-    VFIOContainer *container = giommu->container;
-    IOMMUTLBEntry *iotlb = data;
-    MemoryRegion *mr;
-    hwaddr xlat;
-    hwaddr len = iotlb->addr_mask + 1;
-    void *vaddr;
-    int ret;
-
-    trace_vfio_iommu_map_notify(iotlb->iova,
-                                iotlb->iova + iotlb->addr_mask);
-
-    /*
-     * The IOMMU TLB entry we have just covers translation through
-     * this IOMMU to its immediate target.  We need to translate
-     * it the rest of the way through to memory.
-     */
-    mr = address_space_translate(&address_space_memory,
-                                 iotlb->translated_addr,
-                                 &xlat, &len, iotlb->perm & IOMMU_WO);
-    if (!memory_region_is_ram(mr)) {
-        error_report("iommu map to non memory area %"HWADDR_PRIx"\n",
-                     xlat);
-        return;
-    }
-    /*
-     * Translation truncates length to the IOMMU page size,
-     * check that it did not truncate too much.
-     */
-    if (len & iotlb->addr_mask) {
-        error_report("iommu has granularity incompatible with target AS\n");
-        return;
-    }
-
-    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
-        vaddr = memory_region_get_ram_ptr(mr) + xlat;
-        ret = vfio_dma_map(container, iotlb->iova,
-                           iotlb->addr_mask + 1, vaddr,
-                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
-        if (ret) {
-            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                         container, iotlb->iova,
-                         iotlb->addr_mask + 1, vaddr, ret);
-        }
-    } else {
-        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
-        if (ret) {
-            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx") = %d (%m)",
-                         container, iotlb->iova,
-                         iotlb->addr_mask + 1, ret);
-        }
-    }
-}
-
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -344,45 +286,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
 
     memory_region_ref(section->mr);
 
-    if (memory_region_is_iommu(section->mr)) {
-        VFIOGuestIOMMU *giommu;
-
-        trace_vfio_listener_region_add_iommu(iova,
-                    int128_get64(int128_sub(llend, int128_one())));
-        /*
-         * FIXME: We should do some checking to see if the
-         * capabilities of the host VFIO IOMMU are adequate to model
-         * the guest IOMMU
-         *
-         * FIXME: For VFIO iommu types which have KVM acceleration to
-         * avoid bouncing all map/unmaps through qemu this way, this
-         * would be the right place to wire that up (tell the KVM
-         * device emulation the VFIO iommu handles to use).
-         */
-        /*
-         * This assumes that the guest IOMMU is empty of
-         * mappings at this point.
-         *
-         * One way of doing this is:
-         * 1. Avoid sharing IOMMUs between emulated devices or different
-         * IOMMU groups.
-         * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
-         * there are some mappings in IOMMU.
-         *
-         * VFIO on SPAPR does that. Other IOMMU models may do that different,
-         * they must make sure there are no existing mappings or
-         * loop through existing mappings to map them into VFIO.
-         */
-        giommu = g_malloc0(sizeof(*giommu));
-        giommu->iommu = section->mr;
-        giommu->container = container;
-        giommu->n.notify = vfio_iommu_map_notify;
-        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
-        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
-
-        return;
-    }
-
     /* Here we assume that memory_region_is_ram(section->mr)==true */
 
     end = int128_get64(llend);
@@ -435,27 +338,6 @@ static void vfio_listener_region_del(MemoryListener *listener,
         return;
     }
 
-    if (memory_region_is_iommu(section->mr)) {
-        VFIOGuestIOMMU *giommu;
-
-        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
-            if (giommu->iommu == section->mr) {
-                memory_region_unregister_iommu_notifier(&giommu->n);
-                QLIST_REMOVE(giommu, giommu_next);
-                g_free(giommu);
-                break;
-            }
-        }
-
-        /*
-         * FIXME: We assume the one big unmap below is adequate to
-         * remove any individual page mappings in the IOMMU which
-         * might have been copied into VFIO. This works for a page table
-         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
-         * That may not be true for all IOMMU types.
-         */
-    }
-
     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
     end = (section->offset_within_address_space + int128_get64(section->size)) &
           TARGET_PAGE_MASK;
@@ -721,11 +603,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
             goto free_container_exit;
         }
 
-        container->iommu_data.type1.listener = vfio_memory_listener;
-        container->iommu_data.release = vfio_listener_release;
-
-        memory_listener_register(&container->iommu_data.type1.listener,
-                                 container->space->as);
+        spapr_memory_listener_register(container);
 
     } else {
         error_report("vfio: No available IOMMU models");
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
new file mode 100644
index 0000000..f743138
--- /dev/null
+++ b/hw/vfio/spapr.c
@@ -0,0 +1,226 @@ 
+/*
+ * QEMU sPAPR VFIO IOMMU
+ *
+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License,
+ *  or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hw/vfio/vfio-common.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+static void vfio_iommu_map_notify(Notifier *n, void *data)
+{
+    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+    VFIOContainer *container = giommu->container;
+    IOMMUTLBEntry *iotlb = data;
+    MemoryRegion *mr;
+    hwaddr xlat;
+    hwaddr len = iotlb->addr_mask + 1;
+    void *vaddr;
+    int ret;
+
+    trace_vfio_iommu_map_notify(iotlb->iova,
+                                iotlb->iova + iotlb->addr_mask);
+
+    /*
+     * The IOMMU TLB entry we have just covers translation through
+     * this IOMMU to its immediate target.  We need to translate
+     * it the rest of the way through to memory.
+     */
+    mr = address_space_translate(&address_space_memory,
+                                 iotlb->translated_addr,
+                                 &xlat, &len, iotlb->perm & IOMMU_WO);
+    if (!memory_region_is_ram(mr)) {
+        error_report("iommu map to non memory area %"HWADDR_PRIx"\n",
+                     xlat);
+        return;
+    }
+    /*
+     * The translation truncates the length to the IOMMU page size;
+     * check that it did not truncate too much.
+     */
+    if (len & iotlb->addr_mask) {
+        error_report("iommu has granularity incompatible with target AS");
+        return;
+    }
+
+    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
+        vaddr = memory_region_get_ram_ptr(mr) + xlat;
+        ret = vfio_dma_map(container, iotlb->iova,
+                           iotlb->addr_mask + 1, vaddr,
+                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
+        if (ret) {
+            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
+                         container, iotlb->iova,
+                         iotlb->addr_mask + 1, vaddr, ret);
+        }
+    } else {
+        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
+        if (ret) {
+            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+                         "0x%"HWADDR_PRIx") = %d (%m)",
+                         container, iotlb->iova,
+                         iotlb->addr_mask + 1, ret);
+        }
+    }
+}
+
+static void vfio_spapr_listener_region_add(MemoryListener *listener,
+                                           MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.spapr.listener);
+    hwaddr iova;
+    Int128 llend;
+    VFIOGuestIOMMU *giommu;
+
+    if (vfio_listener_skipped_section(section)) {
+        trace_vfio_listener_region_add_skip(
+            section->offset_within_address_space,
+            section->offset_within_address_space +
+            int128_get64(int128_sub(section->size, int128_one())));
+        return;
+    }
+
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region", __func__);
+        return;
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
+
+    if (int128_ge(int128_make64(iova), llend)) {
+        return;
+    }
+
+    memory_region_ref(section->mr);
+
+    trace_vfio_listener_region_add_iommu(iova,
+         int128_get64(int128_sub(llend, int128_one())));
+    /*
+     * FIXME: We should do some checking to see if the
+     * capabilities of the host VFIO IOMMU are adequate to model
+     * the guest IOMMU
+     *
+     * FIXME: For VFIO iommu types which have KVM acceleration to
+     * avoid bouncing all map/unmaps through qemu this way, this
+     * would be the right place to wire that up (tell the KVM
+     * device emulation the VFIO iommu handles to use).
+     */
+    /*
+     * This assumes that the guest IOMMU is empty of
+     * mappings at this point.
+     *
+     * One way of doing this is:
+     * 1. Avoid sharing IOMMUs between emulated devices or different
+     * IOMMU groups.
+     * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
+     * there are some mappings in IOMMU.
+     *
+     * VFIO on SPAPR does that. Other IOMMU models may handle this
+     * differently; they must either ensure there are no existing
+     * mappings or loop through existing mappings and map them into VFIO.
+     */
+    giommu = g_malloc0(sizeof(*giommu));
+    giommu->iommu = section->mr;
+    giommu->container = container;
+    giommu->n.notify = vfio_iommu_map_notify;
+    QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
+    memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
+}
+
+static void vfio_spapr_listener_region_del(MemoryListener *listener,
+                                           MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.spapr.listener);
+    hwaddr iova, end;
+    int ret;
+    VFIOGuestIOMMU *giommu;
+
+    if (vfio_listener_skipped_section(section)) {
+        trace_vfio_listener_region_del_skip(
+            section->offset_within_address_space,
+            section->offset_within_address_space +
+            int128_get64(int128_sub(section->size, int128_one())));
+        return;
+    }
+
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region", __func__);
+        return;
+    }
+
+    QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
+        if (giommu->iommu == section->mr) {
+            memory_region_unregister_iommu_notifier(&giommu->n);
+            QLIST_REMOVE(giommu, giommu_next);
+            g_free(giommu);
+            break;
+        }
+    }
+
+    /*
+     * FIXME: We assume the one big unmap below is adequate to
+     * remove any individual page mappings in the IOMMU which
+     * might have been copied into VFIO. This works for a page table
+     * based IOMMU where a big unmap flattens a large range of IO-PTEs.
+     * That may not be true for all IOMMU types.
+     */
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    end = (section->offset_within_address_space + int128_get64(section->size)) &
+        TARGET_PAGE_MASK;
+
+    if (iova >= end) {
+        return;
+    }
+
+    trace_vfio_listener_region_del(iova, end - 1);
+
+    ret = vfio_dma_unmap(container, iova, end - iova);
+    memory_region_unref(section->mr);
+    if (ret) {
+        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+                     "0x%"HWADDR_PRIx") = %d (%m)",
+                     container, iova, end - iova, ret);
+    }
+}
+
+static const MemoryListener vfio_spapr_memory_listener = {
+    .region_add = vfio_spapr_listener_region_add,
+    .region_del = vfio_spapr_listener_region_del,
+};
+
+static void vfio_spapr_listener_release(VFIOContainer *container)
+{
+    memory_listener_unregister(&container->iommu_data.spapr.listener);
+}
+
+void spapr_memory_listener_register(VFIOContainer *container)
+{
+    container->iommu_data.spapr.listener = vfio_spapr_memory_listener;
+    container->iommu_data.release = vfio_spapr_listener_release;
+
+    memory_listener_register(&container->iommu_data.spapr.listener,
+                             container->space->as);
+}
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index da85aa9..88d9e1f 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -70,6 +70,10 @@ typedef struct VFIOType1 {
     bool initialized;
 } VFIOType1;
 
+typedef struct VFIOSPAPR {
+    MemoryListener listener;
+} VFIOSPAPR;
+
 typedef struct VFIOContainer {
     VFIOAddressSpace *space;
     int fd; /* /dev/vfio/vfio, empowered by the attached groups */
@@ -77,6 +81,7 @@ typedef struct VFIOContainer {
         /* enable abstraction to support various iommu backends */
         union {
             VFIOType1 type1;
+            VFIOSPAPR spapr;
         };
         void (*release)(struct VFIOContainer *);
     } iommu_data;
@@ -145,4 +150,12 @@ extern const MemoryRegionOps vfio_region_ops;
 extern QLIST_HEAD(vfio_group_head, VFIOGroup) vfio_group_list;
 extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces;
 
+extern int vfio_dma_map(VFIOContainer *container, hwaddr iova,
+                        ram_addr_t size, void *vaddr, bool readonly);
+extern int vfio_dma_unmap(VFIOContainer *container,
+                          hwaddr iova, ram_addr_t size);
+extern bool vfio_listener_skipped_section(MemoryRegionSection *section);
+
+extern void spapr_memory_listener_register(VFIOContainer *container);
+
 #endif /* !HW_VFIO_VFIO_COMMON_H */
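
Both listener callbacks recover their VFIOContainer from the embedded
MemoryListener with container_of(), which works for either union member
because both occupy the same offset inside iommu_data. A self-contained
illustration of the idiom (stand-in types, not QEMU code):

    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in for QEMU's container_of() */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct listener {
        void (*region_add)(struct listener *l);
    };

    struct container {
        int fd;
        struct {
            union {
                struct listener type1;
                struct listener spapr;
            };
        } iommu_data;
    };

    static void add_cb(struct listener *l)
    {
        /* Recover the enclosing container exactly as
         * vfio_spapr_listener_region_add() does. */
        struct container *c =
            container_of(l, struct container, iommu_data.spapr);
        printf("fd = %d\n", c->fd);
    }

    int main(void)
    {
        struct container c = { .fd = 42 };
        c.iommu_data.spapr.region_add = add_cb;
        c.iommu_data.spapr.region_add(&c.iommu_data.spapr);
        return 0;
    }
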