diff mbox

[qemu,v14,18/18] spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)

Message ID 1458546426-26222-19-git-send-email-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy March 21, 2016, 7:47 a.m. UTC
This adds support for Dynamic DMA Windows (DDW) option defined by
the SPAPR specification which allows to have additional DMA window(s)

This implements DDW for emulated and VFIO devices.
This reserves RTAS token numbers for DDW calls.

This changes the TCE table migration descriptor to support dynamic
tables as from now on, PHB will create as many stub TCE table objects
as PHB can possibly support but not all of them might be initialized at
the time of migration because DDW might or might not be requested by
the guest.

The "ddw" property is enabled by default on a PHB but for compatibility
the pseries-2.5 machine and older disable it.

This implements DDW for VFIO. The host kernel support is required.
This adds a "levels" property to PHB to control the number of levels
in the actual TCE table allocated by the host kernel, 0 is the default
value to tell QEMU to calculate the correct value. Current hardware
supports up to 5 levels.

The existing linux guests try creating one additional huge DMA window
with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
the guest switches to dma_direct_ops and never calls TCE hypercalls
(H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
and not waste time on map/unmap later. This adds a "dma64_win_addr"
property which is a bus address for the 64bit window and by default
set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
uses and this allows having emulated and VFIO devices on the same bus.

This adds 4 RTAS handlers:
* ibm,query-pe-dma-window
* ibm,create-pe-dma-window
* ibm,remove-pe-dma-window
* ibm,reset-pe-dma-window
These are registered from type_init() callback.

These RTAS handlers are implemented in a separate file to avoid polluting
spapr_iommu.c with PCI.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/ppc/Makefile.objs        |   1 +
 hw/ppc/spapr.c              |   7 +-
 hw/ppc/spapr_pci.c          |  73 ++++++++---
 hw/ppc/spapr_rtas_ddw.c     | 300 ++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/common.c            |   5 -
 include/hw/pci-host/spapr.h |  13 ++
 include/hw/ppc/spapr.h      |  16 ++-
 trace-events                |   4 +
 8 files changed, 395 insertions(+), 24 deletions(-)
 create mode 100644 hw/ppc/spapr_rtas_ddw.c

Comments

David Gibson March 23, 2016, 2:13 a.m. UTC | #1
On Mon, Mar 21, 2016 at 06:47:06PM +1100, Alexey Kardashevskiy wrote:
> This adds support for Dynamic DMA Windows (DDW) option defined by
> the SPAPR specification which allows to have additional DMA window(s)
> 
> This implements DDW for emulated and VFIO devices.
> This reserves RTAS token numbers for DDW calls.
> 
> This changes the TCE table migration descriptor to support dynamic
> tables as from now on, PHB will create as many stub TCE table objects
> as PHB can possibly support but not all of them might be initialized at
> the time of migration because DDW might or might not be requested by
> the guest.
> 
> The "ddw" property is enabled by default on a PHB but for compatibility
> the pseries-2.5 machine and older disable it.
> 
> This implements DDW for VFIO. The host kernel support is required.
> This adds a "levels" property to PHB to control the number of levels
> in the actual TCE table allocated by the host kernel, 0 is the default
> value to tell QEMU to calculate the correct value. Current hardware
> supports up to 5 levels.
> 
> The existing linux guests try creating one additional huge DMA window
> with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
> the guest switches to dma_direct_ops and never calls TCE hypercalls
> (H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
> and not waste time on map/unmap later. This adds a "dma64_win_addr"
> property which is a bus address for the 64bit window and by default
> set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
> uses and this allows having emulated and VFIO devices on the same bus.
> 
> This adds 4 RTAS handlers:
> * ibm,query-pe-dma-window
> * ibm,create-pe-dma-window
> * ibm,remove-pe-dma-window
> * ibm,reset-pe-dma-window
> These are registered from type_init() callback.
> 
> These RTAS handlers are implemented in a separate file to avoid polluting
> spapr_iommu.c with PCI.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/ppc/Makefile.objs        |   1 +
>  hw/ppc/spapr.c              |   7 +-
>  hw/ppc/spapr_pci.c          |  73 ++++++++---
>  hw/ppc/spapr_rtas_ddw.c     | 300 ++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/common.c            |   5 -
>  include/hw/pci-host/spapr.h |  13 ++
>  include/hw/ppc/spapr.h      |  16 ++-
>  trace-events                |   4 +
>  8 files changed, 395 insertions(+), 24 deletions(-)
>  create mode 100644 hw/ppc/spapr_rtas_ddw.c
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index c1ffc77..986b36f 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -7,6 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>  obj-y += spapr_pci_vfio.o
>  endif
> +obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
>  # PowerPC 4xx boards
>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
>  obj-y += ppc4xx_pci.o
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d0bb423..ef4c637 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2362,7 +2362,12 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
>   * pseries-2.5
>   */
>  #define SPAPR_COMPAT_2_5 \
> -        HW_COMPAT_2_5
> +        HW_COMPAT_2_5 \
> +        {\
> +            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
> +            .property = "ddw",\
> +            .value    = stringify(off),\
> +        },
>  
>  static void spapr_machine_2_5_instance_options(MachineState *machine)
>  {
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index af99a36..3bb294a 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -803,12 +803,12 @@ static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
>      return buf;
>  }
>  
> -static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> -                                       uint32_t liobn,
> -                                       uint32_t page_shift,
> -                                       uint64_t window_addr,
> -                                       uint64_t window_size,
> -                                       Error **errp)
> +void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> +                                 uint32_t liobn,
> +                                 uint32_t page_shift,
> +                                 uint64_t window_addr,
> +                                 uint64_t window_size,
> +                                 Error **errp)
>  {
>      sPAPRTCETable *tcet;
>      uint32_t nb_table = window_size >> page_shift;
> @@ -825,10 +825,16 @@ static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>          return;
>      }
>  
> +    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
> +        error_setg(errp,
> +                   "Attempt to use second window when DDW is disabled on PHB");
> +        return;
> +    }

This should never happen unless something is wrong with the tests in
the RTAS functions, yes?  In which case it should probably be an
assert().


>      spapr_tce_table_enable(tcet, page_shift, window_addr, nb_table);
>  }
>  
> -static int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
> +int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
>  {
>      sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>  
> @@ -1492,14 +1498,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>      }
>  
>      /* DMA setup */
> -    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
> -    if (!tcet) {
> -        error_report("No default TCE table for %s", sphb->dtbusname);
> -        return;
> -    }
>  
> -    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> -                                        spapr_tce_get_iommu(tcet), 0);
> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> +        tcet = spapr_tce_new_table(DEVICE(sphb),
> +                                   SPAPR_PCI_LIOBN(sphb->index, i));
> +        if (!tcet) {
> +            error_setg(errp, "Creating window#%d failed for %s",
> +                       i, sphb->dtbusname);
> +            return;
> +        }
> +        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> +                                            spapr_tce_get_iommu(tcet), 0);
> +    }
>  
>      sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
>  }
> @@ -1517,11 +1527,16 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
>  
>  void spapr_phb_dma_reset(sPAPRPHBState *sphb)
>  {
> -    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
>      Error *local_err = NULL;
> +    int i;
>  
> -    if (tcet && tcet->enabled) {
> -        spapr_phb_dma_window_disable(sphb, sphb->dma_liobn);
> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> +        uint32_t liobn = SPAPR_PCI_LIOBN(sphb->index, i);
> +        sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +
> +        if (tcet && tcet->enabled) {
> +            spapr_phb_dma_window_disable(sphb, liobn);
> +        }
>      }
>  
>      /* Register default 32bit DMA window */
> @@ -1562,6 +1577,13 @@ static Property spapr_phb_properties[] = {
>      /* Default DMA window is 0..1GB */
>      DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
>      DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
> +    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
> +                       0x800000000000000ULL),
> +    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> +    DEFINE_PROP_UINT32("windows", sPAPRPHBState, windows_supported,
> +                       SPAPR_PCI_DMA_MAX_WINDOWS),

What will happen if the user tries to set 'windows' larger than
SPAPR_PCI_DMA_MAX_WINDOWS?

> +    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
> +                       (1ULL << 12) | (1ULL << 16) | (1ULL << 24)),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -1815,6 +1837,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      uint32_t interrupt_map_mask[] = {
>          cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
>      uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
> +    uint32_t ddw_applicable[] = {
> +        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
> +        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
> +        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
> +    };
> +    uint32_t ddw_extensions[] = {
> +        cpu_to_be32(1),
> +        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> +    };
>      sPAPRTCETable *tcet;
>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
>      sPAPRFDT s_fdt;
> @@ -1839,6 +1870,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
>  
> +    /* Dynamic DMA window */
> +    if (phb->ddw_enabled) {
> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
> +                         sizeof(ddw_applicable)));
> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
> +                         &ddw_extensions, sizeof(ddw_extensions)));
> +    }
> +
>      /* Build the interrupt-map, this must matches what is done
>       * in pci_spapr_map_irq
>       */
> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
> new file mode 100644
> index 0000000..37f805f
> --- /dev/null
> +++ b/hw/ppc/spapr_rtas_ddw.c
> @@ -0,0 +1,300 @@
> +/*
> + * QEMU sPAPR Dynamic DMA windows support
> + *
> + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License,
> + *  or (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License
> + *  along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/error-report.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/pci-host/spapr.h"
> +#include "trace.h"
> +
> +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
> +{
> +    sPAPRTCETable *tcet;
> +
> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
> +    if (tcet && tcet->enabled) {
> +        ++*(unsigned *)opaque;
> +    }
> +    return 0;
> +}
> +
> +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
> +{
> +    unsigned ret = 0;
> +
> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
> +
> +    return ret;
> +}
> +
> +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
> +{
> +    sPAPRTCETable *tcet;
> +
> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
> +    if (tcet && !tcet->enabled) {
> +        *(uint32_t *)opaque = tcet->liobn;
> +        return 1;
> +    }
> +    return 0;
> +}
> +
> +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
> +{
> +    uint32_t liobn = 0;
> +
> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
> +
> +    return liobn;
> +}
> +
> +static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
> +                                 uint64_t page_mask)
> +{
> +    int i, j;
> +    uint32_t mask = 0;
> +    const struct { int shift; uint32_t mask; } masks[] = {
> +        { 12, RTAS_DDW_PGSIZE_4K },
> +        { 16, RTAS_DDW_PGSIZE_64K },
> +        { 24, RTAS_DDW_PGSIZE_16M },
> +        { 25, RTAS_DDW_PGSIZE_32M },
> +        { 26, RTAS_DDW_PGSIZE_64M },
> +        { 27, RTAS_DDW_PGSIZE_128M },
> +        { 28, RTAS_DDW_PGSIZE_256M },
> +        { 34, RTAS_DDW_PGSIZE_16G },
> +    };
> +
> +    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
> +        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
> +            if ((sps[i].page_shift == masks[j].shift) &&
> +                    (page_mask & (1ULL << masks[j].shift))) {
> +                mask |= masks[j].mask;
> +            }
> +        }
> +    }
> +
> +    return mask;
> +}
> +
> +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
> +                                         sPAPRMachineState *spapr,
> +                                         uint32_t token, uint32_t nargs,
> +                                         target_ulong args,
> +                                         uint32_t nret, target_ulong rets)
> +{
> +    CPUPPCState *env = &cpu->env;
> +    sPAPRPHBState *sphb;
> +    uint64_t buid, max_window_size;
> +    uint32_t avail, addr, pgmask = 0;
> +
> +    if ((nargs != 3) || (nret != 5)) {
> +        goto param_error_exit;
> +    }
> +
> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> +    addr = rtas_ld(args, 0);
> +    sphb = spapr_pci_find_phb(spapr, buid);
> +    if (!sphb || !sphb->ddw_enabled) {
> +        goto param_error_exit;
> +    }
> +
> +    /* Work out supported page masks */
> +    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);

There are a few potential problems here.  First you're just
arbitrarily picking the first entry in the sps array to filter
against, which doesn't seem right (except by accident).  It's a little
bit odd filtering against guest page sizes at all, although I get what
you're really trying to do is filter against allowed host page sizes.

The other problem is that silently filtering capabilities based on the
host can be a pain for migration - I've made the mistake and had it
bite me in the past.  I think it would be safer to just check the
pagesizes requested in the property against what's possible and
outright fail if they don't match.  For convenience you could also set
according to host capabilities if the user doesn't specify anything,
but that would require explicit handling of the "default" case.

Remember that this code will be relevant for DDW with emulated
devices, even if VFIO is not in play at all.

All those considerations aside, it seems like it would make more sense
to do this filtering during device realize, rather than leaving it
until the guest queries.

> +    /*
> +     * This is "Largest contiguous block of TCEs allocated specifically
> +     * for (that is, are reserved for) this PE".
> +     * Return the maximum number as maximum supported RAM size was in 4K pages.
> +     */
> +    max_window_size = MACHINE(spapr)->maxram_size >> SPAPR_TCE_PAGE_SHIFT;

Will maxram_size always be enough?  There will sometimes be an
alignment gap between the "base" RAM and the hotpluggable RAM, meaning
that if everything is plugged the last RAM address will be beyond
maxram_size.  Will that require pushing this number up, or will the
guest "repack" the RAM layout when it maps it into the TCE tables?

> +    avail = sphb->windows_supported - spapr_phb_get_active_win_num(sphb);
> +
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +    rtas_st(rets, 1, avail);
> +    rtas_st(rets, 2, max_window_size);
> +    rtas_st(rets, 3, pgmask);
> +    rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
> +
> +    trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
> +    return;
> +
> +param_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +}
> +
> +static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
> +                                          sPAPRMachineState *spapr,
> +                                          uint32_t token, uint32_t nargs,
> +                                          target_ulong args,
> +                                          uint32_t nret, target_ulong rets)
> +{
> +    sPAPRPHBState *sphb;
> +    sPAPRTCETable *tcet = NULL;
> +    uint32_t addr, page_shift, window_shift, liobn;
> +    uint64_t buid;
> +    Error *local_err = NULL;
> +
> +    if ((nargs != 5) || (nret != 4)) {
> +        goto param_error_exit;
> +    }
> +
> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> +    addr = rtas_ld(args, 0);
> +    sphb = spapr_pci_find_phb(spapr, buid);
> +    if (!sphb || !sphb->ddw_enabled) {
> +        goto param_error_exit;
> +    }
> +
> +    page_shift = rtas_ld(args, 3);
> +    window_shift = rtas_ld(args, 4);
> +    liobn = spapr_phb_get_free_liobn(sphb);
> +
> +    if (!liobn || !(sphb->page_size_mask & (1ULL << page_shift)) ||
> +        spapr_phb_get_active_win_num(sphb) == sphb->windows_supported) {
> +        goto hw_error_exit;
> +    }

Bad page sizes should be H_PARAM, not H_HARDWARE, no?

Also is no available liobns H_HARDWARE, or H_RESOURCE?

> +
> +    if (window_shift < page_shift) {
> +        goto param_error_exit;
> +    }
> +
> +    spapr_phb_dma_window_enable(sphb, liobn, page_shift,
> +                                sphb->dma64_window_addr,
> +                                1ULL << window_shift, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        goto hw_error_exit;
> +    }
> +
> +    tcet = spapr_tce_find_by_liobn(liobn);
> +    trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift,
> +                                 1ULL << window_shift,
> +                                 tcet ? tcet->bus_offset : 0xbaadf00d, liobn);
> +    if (local_err || !tcet) {
> +        goto hw_error_exit;
> +    }
> +
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +    rtas_st(rets, 1, liobn);
> +    rtas_st(rets, 2, tcet->bus_offset >> 32);
> +    rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1));
> +
> +    return;
> +
> +hw_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
> +    return;
> +
> +param_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +}
> +
> +static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu,
> +                                          sPAPRMachineState *spapr,
> +                                          uint32_t token, uint32_t nargs,
> +                                          target_ulong args,
> +                                          uint32_t nret, target_ulong rets)
> +{
> +    sPAPRPHBState *sphb;
> +    sPAPRTCETable *tcet;
> +    uint32_t liobn;
> +    long ret;
> +
> +    if ((nargs != 1) || (nret != 1)) {
> +        goto param_error_exit;
> +    }
> +
> +    liobn = rtas_ld(args, 0);
> +    tcet = spapr_tce_find_by_liobn(liobn);
> +    if (!tcet) {
> +        goto param_error_exit;
> +    }
> +
> +    sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent);
> +    if (!sphb || !sphb->ddw_enabled || !spapr_phb_get_active_win_num(sphb)) {

Checking spapr_phb_get_active_win_num() seems weird.  You already know
that this tcet is a child of the sphb.  Can't you just check its own
enabled property.

> +        goto param_error_exit;
> +    }
> +
> +    ret = spapr_phb_dma_window_disable(sphb, liobn);
> +    trace_spapr_iommu_ddw_remove(liobn, ret);
> +    if (ret) {
> +        goto hw_error_exit;
> +    }
> +
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +    return;
> +
> +hw_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
> +    return;
> +
> +param_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +}
> +
> +static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu,
> +                                         sPAPRMachineState *spapr,
> +                                         uint32_t token, uint32_t nargs,
> +                                         target_ulong args,
> +                                         uint32_t nret, target_ulong rets)
> +{
> +    sPAPRPHBState *sphb;
> +    uint64_t buid;
> +    uint32_t addr;
> +
> +    if ((nargs != 3) || (nret != 1)) {
> +        goto param_error_exit;
> +    }
> +
> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> +    addr = rtas_ld(args, 0);
> +    sphb = spapr_pci_find_phb(spapr, buid);
> +    if (!sphb || !sphb->ddw_enabled) {
> +        goto param_error_exit;
> +    }
> +
> +    spapr_phb_dma_reset(sphb);
> +    trace_spapr_iommu_ddw_reset(buid, addr);
> +
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +
> +    return;
> +
> +param_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +}
> +
> +static void spapr_rtas_ddw_init(void)
> +{
> +    spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
> +                        "ibm,query-pe-dma-window",
> +                        rtas_ibm_query_pe_dma_window);
> +    spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
> +                        "ibm,create-pe-dma-window",
> +                        rtas_ibm_create_pe_dma_window);
> +    spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
> +                        "ibm,remove-pe-dma-window",
> +                        rtas_ibm_remove_pe_dma_window);
> +    spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
> +                        "ibm,reset-pe-dma-window",
> +                        rtas_ibm_reset_pe_dma_window);
> +}
> +
> +type_init(spapr_rtas_ddw_init)
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 421d6eb..b0ea146 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -994,11 +994,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
>              }
>          }
>  
> -        /*
> -         * This only considers the host IOMMU's 32-bit window.  At
> -         * some point we need to add support for the optional 64-bit
> -         * window and dynamic windows
> -         */

This seems like a stray change to VFIO code in a patch that's
otherwise only affecting spapr_pci.  Does this hunk belong in a
different patch?

>          info.argsz = sizeof(info);
>          ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
>          if (ret) {
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 7848366..e81b751 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -71,6 +71,11 @@ struct sPAPRPHBState {
>      spapr_pci_msi_mig *msi_devs;
>  
>      QLIST_ENTRY(sPAPRPHBState) list;
> +
> +    bool ddw_enabled;
> +    uint32_t windows_supported;
> +    uint64_t page_size_mask;
> +    uint64_t dma64_window_addr;
>  };
>  
>  #define SPAPR_PCI_MAX_INDEX          255
> @@ -89,6 +94,8 @@ struct sPAPRPHBState {
>  
>  #define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL
>  
> +#define SPAPR_PCI_DMA_MAX_WINDOWS    2
> +
>  static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> @@ -148,5 +155,11 @@ static inline void spapr_phb_vfio_reset(DeviceState *qdev)
>  #endif
>  
>  void spapr_phb_dma_reset(sPAPRPHBState *sphb);
> +void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> +                                uint32_t liobn, uint32_t page_shift,
> +                                uint64_t window_addr,
> +                                uint64_t window_size,
> +                                 Error **errp);
> +int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn);
>  
>  #endif /* __HW_SPAPR_PCI_H__ */
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 471eb4a..41b32c6 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -417,6 +417,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>  #define RTAS_OUT_NOT_AUTHORIZED                 -9002
>  #define RTAS_OUT_SYSPARM_PARAM_ERROR            -9999
>  
> +/* DDW pagesize mask values from ibm,query-pe-dma-window */
> +#define RTAS_DDW_PGSIZE_4K       0x01
> +#define RTAS_DDW_PGSIZE_64K      0x02
> +#define RTAS_DDW_PGSIZE_16M      0x04
> +#define RTAS_DDW_PGSIZE_32M      0x08
> +#define RTAS_DDW_PGSIZE_64M      0x10
> +#define RTAS_DDW_PGSIZE_128M     0x20
> +#define RTAS_DDW_PGSIZE_256M     0x40
> +#define RTAS_DDW_PGSIZE_16G      0x80
> +
>  /* RTAS tokens */
>  #define RTAS_TOKEN_BASE      0x2000
>  
> @@ -458,8 +468,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>  #define RTAS_IBM_SET_SLOT_RESET                 (RTAS_TOKEN_BASE + 0x23)
>  #define RTAS_IBM_CONFIGURE_PE                   (RTAS_TOKEN_BASE + 0x24)
>  #define RTAS_IBM_SLOT_ERROR_DETAIL              (RTAS_TOKEN_BASE + 0x25)
> +#define RTAS_IBM_QUERY_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x26)
> +#define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
> +#define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
> +#define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
>  
> -#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x26)
> +#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
>  
>  /* RTAS ibm,get-system-parameter token values */
>  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
> diff --git a/trace-events b/trace-events
> index f2b75a3..e68d0e4 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1431,6 +1431,10 @@ spapr_iommu_pci_indirect(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t i
>  spapr_iommu_pci_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce_value, uint64_t npages, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcevalue=0x%"PRIx64" npages=%"PRId64" ret=%"PRId64
>  spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
>  spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d"
> +spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows available, max window size=%"PRIx64", mask=%"PRIx32
> +spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, uint64_t pg_size, uint64_t req_size, uint64_t start, uint32_t liobn) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%"PRIx64", requested=0x%"PRIx64", start addr=%"PRIx64", liobn=%"PRIx32
> +spapr_iommu_ddw_remove(uint32_t liobn, long ret) "liobn=%"PRIx32", ret = %ld"
> +spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr) "buid=%"PRIx64" addr=%"PRIx32
>  
>  # hw/ppc/ppc.c
>  ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
Alexey Kardashevskiy March 23, 2016, 3:28 a.m. UTC | #2
On 03/23/2016 01:13 PM, David Gibson wrote:
> On Mon, Mar 21, 2016 at 06:47:06PM +1100, Alexey Kardashevskiy wrote:
>> This adds support for Dynamic DMA Windows (DDW) option defined by
>> the SPAPR specification which allows to have additional DMA window(s)
>>
>> This implements DDW for emulated and VFIO devices.
>> This reserves RTAS token numbers for DDW calls.
>>
>> This changes the TCE table migration descriptor to support dynamic
>> tables as from now on, PHB will create as many stub TCE table objects
>> as PHB can possibly support but not all of them might be initialized at
>> the time of migration because DDW might or might not be requested by
>> the guest.
>>
>> The "ddw" property is enabled by default on a PHB but for compatibility
>> the pseries-2.5 machine and older disable it.
>>
>> This implements DDW for VFIO. The host kernel support is required.
>> This adds a "levels" property to PHB to control the number of levels
>> in the actual TCE table allocated by the host kernel, 0 is the default
>> value to tell QEMU to calculate the correct value. Current hardware
>> supports up to 5 levels.
>>
>> The existing linux guests try creating one additional huge DMA window
>> with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
>> the guest switches to dma_direct_ops and never calls TCE hypercalls
>> (H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
>> and not waste time on map/unmap later. This adds a "dma64_win_addr"
>> property which is a bus address for the 64bit window and by default
>> set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
>> uses and this allows having emulated and VFIO devices on the same bus.
>>
>> This adds 4 RTAS handlers:
>> * ibm,query-pe-dma-window
>> * ibm,create-pe-dma-window
>> * ibm,remove-pe-dma-window
>> * ibm,reset-pe-dma-window
>> These are registered from type_init() callback.
>>
>> These RTAS handlers are implemented in a separate file to avoid polluting
>> spapr_iommu.c with PCI.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   hw/ppc/Makefile.objs        |   1 +
>>   hw/ppc/spapr.c              |   7 +-
>>   hw/ppc/spapr_pci.c          |  73 ++++++++---
>>   hw/ppc/spapr_rtas_ddw.c     | 300 ++++++++++++++++++++++++++++++++++++++++++++
>>   hw/vfio/common.c            |   5 -
>>   include/hw/pci-host/spapr.h |  13 ++
>>   include/hw/ppc/spapr.h      |  16 ++-
>>   trace-events                |   4 +
>>   8 files changed, 395 insertions(+), 24 deletions(-)
>>   create mode 100644 hw/ppc/spapr_rtas_ddw.c
>>
>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>> index c1ffc77..986b36f 100644
>> --- a/hw/ppc/Makefile.objs
>> +++ b/hw/ppc/Makefile.objs
>> @@ -7,6 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>>   ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>>   obj-y += spapr_pci_vfio.o
>>   endif
>> +obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
>>   # PowerPC 4xx boards
>>   obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
>>   obj-y += ppc4xx_pci.o
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index d0bb423..ef4c637 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -2362,7 +2362,12 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
>>    * pseries-2.5
>>    */
>>   #define SPAPR_COMPAT_2_5 \
>> -        HW_COMPAT_2_5
>> +        HW_COMPAT_2_5 \
>> +        {\
>> +            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
>> +            .property = "ddw",\
>> +            .value    = stringify(off),\
>> +        },
>>
>>   static void spapr_machine_2_5_instance_options(MachineState *machine)
>>   {
>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>> index af99a36..3bb294a 100644
>> --- a/hw/ppc/spapr_pci.c
>> +++ b/hw/ppc/spapr_pci.c
>> @@ -803,12 +803,12 @@ static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
>>       return buf;
>>   }
>>
>> -static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>> -                                       uint32_t liobn,
>> -                                       uint32_t page_shift,
>> -                                       uint64_t window_addr,
>> -                                       uint64_t window_size,
>> -                                       Error **errp)
>> +void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>> +                                 uint32_t liobn,
>> +                                 uint32_t page_shift,
>> +                                 uint64_t window_addr,
>> +                                 uint64_t window_size,
>> +                                 Error **errp)
>>   {
>>       sPAPRTCETable *tcet;
>>       uint32_t nb_table = window_size >> page_shift;
>> @@ -825,10 +825,16 @@ static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>>           return;
>>       }
>>
>> +    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
>> +        error_setg(errp,
>> +                   "Attempt to use second window when DDW is disabled on PHB");
>> +        return;
>> +    }
>
> This should never happen unless something is wrong with the tests in
> the RTAS functions, yes?  In which case it should probably be an
> assert().

This should not. But this is called from the RTAS caller so I'd really like 
to have a message rather than assert() if that condition happens, here or 
in rtas_ibm_create_pe_dma_window().


>
>>       spapr_tce_table_enable(tcet, page_shift, window_addr, nb_table);
>>   }
>>
>> -static int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
>> +int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
>>   {
>>       sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>>
>> @@ -1492,14 +1498,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>>       }
>>
>>       /* DMA setup */
>> -    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
>> -    if (!tcet) {
>> -        error_report("No default TCE table for %s", sphb->dtbusname);
>> -        return;
>> -    }
>>
>> -    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
>> -                                        spapr_tce_get_iommu(tcet), 0);
>> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
>> +        tcet = spapr_tce_new_table(DEVICE(sphb),
>> +                                   SPAPR_PCI_LIOBN(sphb->index, i));
>> +        if (!tcet) {
>> +            error_setg(errp, "Creating window#%d failed for %s",
>> +                       i, sphb->dtbusname);
>> +            return;
>> +        }
>> +        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
>> +                                            spapr_tce_get_iommu(tcet), 0);
>> +    }
>>
>>       sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
>>   }
>> @@ -1517,11 +1527,16 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
>>
>>   void spapr_phb_dma_reset(sPAPRPHBState *sphb)
>>   {
>> -    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
>>       Error *local_err = NULL;
>> +    int i;
>>
>> -    if (tcet && tcet->enabled) {
>> -        spapr_phb_dma_window_disable(sphb, sphb->dma_liobn);
>> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
>> +        uint32_t liobn = SPAPR_PCI_LIOBN(sphb->index, i);
>> +        sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>> +
>> +        if (tcet && tcet->enabled) {
>> +            spapr_phb_dma_window_disable(sphb, liobn);
>> +        }
>>       }
>>
>>       /* Register default 32bit DMA window */
>> @@ -1562,6 +1577,13 @@ static Property spapr_phb_properties[] = {
>>       /* Default DMA window is 0..1GB */
>>       DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
>>       DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
>> +    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
>> +                       0x800000000000000ULL),
>> +    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
>> +    DEFINE_PROP_UINT32("windows", sPAPRPHBState, windows_supported,
>> +                       SPAPR_PCI_DMA_MAX_WINDOWS),
>
> What will happen if the user tries to set 'windows' larger than
> SPAPR_PCI_DMA_MAX_WINDOWS?


Oh. I need to replace SPAPR_PCI_DMA_MAX_WINDOWS with windows_supported 
everywhere, missed that. Besides that, there will be support for more 
windows, that's it. The host VFIO IOMMU driver will fail creating more 
windows but this is expected. For emulated windows, there will be more 
windows with no other consequences.




>> +    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
>> +                       (1ULL << 12) | (1ULL << 16) | (1ULL << 24)),
>>       DEFINE_PROP_END_OF_LIST(),
>>   };
>>
>> @@ -1815,6 +1837,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>       uint32_t interrupt_map_mask[] = {
>>           cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
>>       uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
>> +    uint32_t ddw_applicable[] = {
>> +        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
>> +        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
>> +        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
>> +    };
>> +    uint32_t ddw_extensions[] = {
>> +        cpu_to_be32(1),
>> +        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
>> +    };
>>       sPAPRTCETable *tcet;
>>       PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
>>       sPAPRFDT s_fdt;
>> @@ -1839,6 +1870,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>       _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
>>       _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
>>
>> +    /* Dynamic DMA window */
>> +    if (phb->ddw_enabled) {
>> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
>> +                         sizeof(ddw_applicable)));
>> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
>> +                         &ddw_extensions, sizeof(ddw_extensions)));
>> +    }
>> +
>>       /* Build the interrupt-map, this must matches what is done
>>        * in pci_spapr_map_irq
>>        */
>> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
>> new file mode 100644
>> index 0000000..37f805f
>> --- /dev/null
>> +++ b/hw/ppc/spapr_rtas_ddw.c
>> @@ -0,0 +1,300 @@
>> +/*
>> + * QEMU sPAPR Dynamic DMA windows support
>> + *
>> + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
>> + *
>> + *  This program is free software; you can redistribute it and/or modify
>> + *  it under the terms of the GNU General Public License as published by
>> + *  the Free Software Foundation; either version 2 of the License,
>> + *  or (at your option) any later version.
>> + *
>> + *  This program is distributed in the hope that it will be useful,
>> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + *  GNU General Public License for more details.
>> + *
>> + *  You should have received a copy of the GNU General Public License
>> + *  along with this program; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/error-report.h"
>> +#include "hw/ppc/spapr.h"
>> +#include "hw/pci-host/spapr.h"
>> +#include "trace.h"
>> +
>> +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
>> +{
>> +    sPAPRTCETable *tcet;
>> +
>> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
>> +    if (tcet && tcet->enabled) {
>> +        ++*(unsigned *)opaque;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
>> +{
>> +    unsigned ret = 0;
>> +
>> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
>> +
>> +    return ret;
>> +}
>> +
>> +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
>> +{
>> +    sPAPRTCETable *tcet;
>> +
>> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
>> +    if (tcet && !tcet->enabled) {
>> +        *(uint32_t *)opaque = tcet->liobn;
>> +        return 1;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
>> +{
>> +    uint32_t liobn = 0;
>> +
>> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
>> +
>> +    return liobn;
>> +}
>> +
>> +static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
>> +                                 uint64_t page_mask)
>> +{
>> +    int i, j;
>> +    uint32_t mask = 0;
>> +    const struct { int shift; uint32_t mask; } masks[] = {
>> +        { 12, RTAS_DDW_PGSIZE_4K },
>> +        { 16, RTAS_DDW_PGSIZE_64K },
>> +        { 24, RTAS_DDW_PGSIZE_16M },
>> +        { 25, RTAS_DDW_PGSIZE_32M },
>> +        { 26, RTAS_DDW_PGSIZE_64M },
>> +        { 27, RTAS_DDW_PGSIZE_128M },
>> +        { 28, RTAS_DDW_PGSIZE_256M },
>> +        { 34, RTAS_DDW_PGSIZE_16G },
>> +    };
>> +
>> +    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
>> +        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
>> +            if ((sps[i].page_shift == masks[j].shift) &&
>> +                    (page_mask & (1ULL << masks[j].shift))) {
>> +                mask |= masks[j].mask;
>> +            }
>> +        }
>> +    }
>> +
>> +    return mask;
>> +}
>> +
>> +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
>> +                                         sPAPRMachineState *spapr,
>> +                                         uint32_t token, uint32_t nargs,
>> +                                         target_ulong args,
>> +                                         uint32_t nret, target_ulong rets)
>> +{
>> +    CPUPPCState *env = &cpu->env;
>> +    sPAPRPHBState *sphb;
>> +    uint64_t buid, max_window_size;
>> +    uint32_t avail, addr, pgmask = 0;
>> +
>> +    if ((nargs != 3) || (nret != 5)) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
>> +    addr = rtas_ld(args, 0);
>> +    sphb = spapr_pci_find_phb(spapr, buid);
>> +    if (!sphb || !sphb->ddw_enabled) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    /* Work out supported page masks */
>> +    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);
>
> There are a few potential problems here.  First you're just
> arbitrarily picking the first entry in the sps array to filter

Why first?  spapr_query_mask() has a loop 0..PPC_PAGE_SIZES_MAX_SZ.


> against, which doesn't seem right (except by accident).  It's a little
> bit odd filtering against guest page sizes at all, although I get what
> you're really trying to do is filter against allowed host page sizes.
>
> The other problem is that silently filtering capabilities based on the
> host can be a pain for migration - I've made the mistake and had it
> bite me in the past.  I think it would be safer to just check the
> pagesizes requested in the property against what's possible and
> outright fail if they don't match.  For convenience you could also set
> according to host capabilities if the user doesn't specify anything,
> but that would require explicit handling of the "default" case.


For the migration purposes, both guests should be started with or without 
hugepages enabled; this is taken into account already. Besides that, the 
result of "query" won't differ.


> Remember that this code will be relevant for DDW with emulated
> devices, even if VFIO is not in play at all.
>
> All those considerations aside, it seems like it would make more sense
> to do this filtering during device realize, rather than leaving it
> until the guest queries.

The result will be the same, it only depends on whether hugepages are 
enabled or not and this happens at the start time. But yes, feels more 
accurate to do this in PHB realize(), I'll move it.


>
>> +    /*
>> +     * This is "Largest contiguous block of TCEs allocated specifically
>> +     * for (that is, are reserved for) this PE".
>> +     * Return the maximum number as maximum supported RAM size was in 4K pages.
>> +     */
>> +    max_window_size = MACHINE(spapr)->maxram_size >> SPAPR_TCE_PAGE_SHIFT;
>
> Will maxram_size always be enough?  There will sometimes be an
> alignment gap between the "base" RAM and the hotpluggable RAM, meaning
> that if everything is plugged the last RAM address will be beyond
> maxram_size.  Will that require pushing this number up, or will the
> guest "repack" the RAM layout when it maps it into the TCE tables?


Hm. I do not know what the guest does to DDW on memory hotplug but this is 
a valid point... What QEMU helper does return the last available address in 
the system memory address space? Like memblock_end_of_DRAM() in the kernel, 
I would use that instead.


>> +    avail = sphb->windows_supported - spapr_phb_get_active_win_num(sphb);
>> +
>> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +    rtas_st(rets, 1, avail);
>> +    rtas_st(rets, 2, max_window_size);
>> +    rtas_st(rets, 3, pgmask);
>> +    rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
>> +
>> +    trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
>> +    return;
>> +
>> +param_error_exit:
>> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
>> +}
>> +
>> +static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
>> +                                          sPAPRMachineState *spapr,
>> +                                          uint32_t token, uint32_t nargs,
>> +                                          target_ulong args,
>> +                                          uint32_t nret, target_ulong rets)
>> +{
>> +    sPAPRPHBState *sphb;
>> +    sPAPRTCETable *tcet = NULL;
>> +    uint32_t addr, page_shift, window_shift, liobn;
>> +    uint64_t buid;
>> +    Error *local_err = NULL;
>> +
>> +    if ((nargs != 5) || (nret != 4)) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
>> +    addr = rtas_ld(args, 0);
>> +    sphb = spapr_pci_find_phb(spapr, buid);
>> +    if (!sphb || !sphb->ddw_enabled) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    page_shift = rtas_ld(args, 3);
>> +    window_shift = rtas_ld(args, 4);
>> +    liobn = spapr_phb_get_free_liobn(sphb);
>> +
>> +    if (!liobn || !(sphb->page_size_mask & (1ULL << page_shift)) ||
>> +        spapr_phb_get_active_win_num(sphb) == sphb->windows_supported) {
>> +        goto hw_error_exit;
>> +    }
>
> Bad page sizes should be H_PARAM, not H_HARDWARE, no?

Correct, I'll fix it.

>
> Also is no available liobns H_HARDWARE, or H_RESOURCE?

LoPAPR says this should return -1 (hardware), -2 (function), -3 
(privilege), not -16. So I pick -1 for this.


>> +
>> +    if (window_shift < page_shift) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    spapr_phb_dma_window_enable(sphb, liobn, page_shift,
>> +                                sphb->dma64_window_addr,
>> +                                1ULL << window_shift, &local_err);
>> +    if (local_err) {
>> +        error_report_err(local_err);
>> +        goto hw_error_exit;
>> +    }
>> +
>> +    tcet = spapr_tce_find_by_liobn(liobn);
>> +    trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift,
>> +                                 1ULL << window_shift,
>> +                                 tcet ? tcet->bus_offset : 0xbaadf00d, liobn);
>> +    if (local_err || !tcet) {
>> +        goto hw_error_exit;
>> +    }
>> +
>> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +    rtas_st(rets, 1, liobn);
>> +    rtas_st(rets, 2, tcet->bus_offset >> 32);
>> +    rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1));
>> +
>> +    return;
>> +
>> +hw_error_exit:
>> +    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
>> +    return;
>> +
>> +param_error_exit:
>> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
>> +}
>> +
>> +static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu,
>> +                                          sPAPRMachineState *spapr,
>> +                                          uint32_t token, uint32_t nargs,
>> +                                          target_ulong args,
>> +                                          uint32_t nret, target_ulong rets)
>> +{
>> +    sPAPRPHBState *sphb;
>> +    sPAPRTCETable *tcet;
>> +    uint32_t liobn;
>> +    long ret;
>> +
>> +    if ((nargs != 1) || (nret != 1)) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    liobn = rtas_ld(args, 0);
>> +    tcet = spapr_tce_find_by_liobn(liobn);
>> +    if (!tcet) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent);
>> +    if (!sphb || !sphb->ddw_enabled || !spapr_phb_get_active_win_num(sphb)) {
>
> Checking spapr_phb_get_active_win_num() seems weird.  You already know
> that this tcet is a child of the sphb.  Can't you just check its own
> enabled property.
>
>> +        goto param_error_exit;
>> +    }
>> +
>> +    ret = spapr_phb_dma_window_disable(sphb, liobn);
>> +    trace_spapr_iommu_ddw_remove(liobn, ret);
>> +    if (ret) {
>> +        goto hw_error_exit;
>> +    }
>> +
>> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +    return;
>> +
>> +hw_error_exit:
>> +    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
>> +    return;
>> +
>> +param_error_exit:
>> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
>> +}
>> +
>> +static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu,
>> +                                         sPAPRMachineState *spapr,
>> +                                         uint32_t token, uint32_t nargs,
>> +                                         target_ulong args,
>> +                                         uint32_t nret, target_ulong rets)
>> +{
>> +    sPAPRPHBState *sphb;
>> +    uint64_t buid;
>> +    uint32_t addr;
>> +
>> +    if ((nargs != 3) || (nret != 1)) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
>> +    addr = rtas_ld(args, 0);
>> +    sphb = spapr_pci_find_phb(spapr, buid);
>> +    if (!sphb || !sphb->ddw_enabled) {
>> +        goto param_error_exit;
>> +    }
>> +
>> +    spapr_phb_dma_reset(sphb);
>> +    trace_spapr_iommu_ddw_reset(buid, addr);
>> +
>> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +
>> +    return;
>> +
>> +param_error_exit:
>> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
>> +}
>> +
>> +static void spapr_rtas_ddw_init(void)
>> +{
>> +    spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
>> +                        "ibm,query-pe-dma-window",
>> +                        rtas_ibm_query_pe_dma_window);
>> +    spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
>> +                        "ibm,create-pe-dma-window",
>> +                        rtas_ibm_create_pe_dma_window);
>> +    spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
>> +                        "ibm,remove-pe-dma-window",
>> +                        rtas_ibm_remove_pe_dma_window);
>> +    spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
>> +                        "ibm,reset-pe-dma-window",
>> +                        rtas_ibm_reset_pe_dma_window);
>> +}
>> +
>> +type_init(spapr_rtas_ddw_init)
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 421d6eb..b0ea146 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -994,11 +994,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
>>               }
>>           }
>>
>> -        /*
>> -         * This only considers the host IOMMU's 32-bit window.  At
>> -         * some point we need to add support for the optional 64-bit
>> -         * window and dynamic windows
>> -         */
>
> This seems like a stray change to VFIO code in a patch that's
> otherwise only affecting spapr_pci.  Does this hunk belong in a
> different patch?


It does :)
David Gibson March 23, 2016, 6:11 a.m. UTC | #3
On Wed, Mar 23, 2016 at 02:28:01PM +1100, Alexey Kardashevskiy wrote:
> On 03/23/2016 01:13 PM, David Gibson wrote:
> >On Mon, Mar 21, 2016 at 06:47:06PM +1100, Alexey Kardashevskiy wrote:
> >>This adds support for Dynamic DMA Windows (DDW) option defined by
> >>the SPAPR specification which allows to have additional DMA window(s)
> >>
> >>This implements DDW for emulated and VFIO devices.
> >>This reserves RTAS token numbers for DDW calls.
> >>
> >>This changes the TCE table migration descriptor to support dynamic
> >>tables as from now on, PHB will create as many stub TCE table objects
> >>as PHB can possibly support but not all of them might be initialized at
> >>the time of migration because DDW might or might not be requested by
> >>the guest.
> >>
> >>The "ddw" property is enabled by default on a PHB but for compatibility
> >>the pseries-2.5 machine and older disable it.
> >>
> >>This implements DDW for VFIO. The host kernel support is required.
> >>This adds a "levels" property to PHB to control the number of levels
> >>in the actual TCE table allocated by the host kernel, 0 is the default
> >>value to tell QEMU to calculate the correct value. Current hardware
> >>supports up to 5 levels.
> >>
> >>The existing linux guests try creating one additional huge DMA window
> >>with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
> >>the guest switches to dma_direct_ops and never calls TCE hypercalls
> >>(H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
> >>and not waste time on map/unmap later. This adds a "dma64_win_addr"
> >>property which is a bus address for the 64bit window and by default
> >>set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
> >>uses and this allows having emulated and VFIO devices on the same bus.
> >>
> >>This adds 4 RTAS handlers:
> >>* ibm,query-pe-dma-window
> >>* ibm,create-pe-dma-window
> >>* ibm,remove-pe-dma-window
> >>* ibm,reset-pe-dma-window
> >>These are registered from type_init() callback.
> >>
> >>These RTAS handlers are implemented in a separate file to avoid polluting
> >>spapr_iommu.c with PCI.
> >>
> >>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>---
> >>  hw/ppc/Makefile.objs        |   1 +
> >>  hw/ppc/spapr.c              |   7 +-
> >>  hw/ppc/spapr_pci.c          |  73 ++++++++---
> >>  hw/ppc/spapr_rtas_ddw.c     | 300 ++++++++++++++++++++++++++++++++++++++++++++
> >>  hw/vfio/common.c            |   5 -
> >>  include/hw/pci-host/spapr.h |  13 ++
> >>  include/hw/ppc/spapr.h      |  16 ++-
> >>  trace-events                |   4 +
> >>  8 files changed, 395 insertions(+), 24 deletions(-)
> >>  create mode 100644 hw/ppc/spapr_rtas_ddw.c
> >>
> >>diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >>index c1ffc77..986b36f 100644
> >>--- a/hw/ppc/Makefile.objs
> >>+++ b/hw/ppc/Makefile.objs
> >>@@ -7,6 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> >>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
> >>  obj-y += spapr_pci_vfio.o
> >>  endif
> >>+obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
> >>  # PowerPC 4xx boards
> >>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
> >>  obj-y += ppc4xx_pci.o
> >>diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>index d0bb423..ef4c637 100644
> >>--- a/hw/ppc/spapr.c
> >>+++ b/hw/ppc/spapr.c
> >>@@ -2362,7 +2362,12 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
> >>   * pseries-2.5
> >>   */
> >>  #define SPAPR_COMPAT_2_5 \
> >>-        HW_COMPAT_2_5
> >>+        HW_COMPAT_2_5 \
> >>+        {\
> >>+            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
> >>+            .property = "ddw",\
> >>+            .value    = stringify(off),\
> >>+        },
> >>
> >>  static void spapr_machine_2_5_instance_options(MachineState *machine)
> >>  {
> >>diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> >>index af99a36..3bb294a 100644
> >>--- a/hw/ppc/spapr_pci.c
> >>+++ b/hw/ppc/spapr_pci.c
> >>@@ -803,12 +803,12 @@ static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
> >>      return buf;
> >>  }
> >>
> >>-static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>-                                       uint32_t liobn,
> >>-                                       uint32_t page_shift,
> >>-                                       uint64_t window_addr,
> >>-                                       uint64_t window_size,
> >>-                                       Error **errp)
> >>+void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>+                                 uint32_t liobn,
> >>+                                 uint32_t page_shift,
> >>+                                 uint64_t window_addr,
> >>+                                 uint64_t window_size,
> >>+                                 Error **errp)
> >>  {
> >>      sPAPRTCETable *tcet;
> >>      uint32_t nb_table = window_size >> page_shift;
> >>@@ -825,10 +825,16 @@ static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>          return;
> >>      }
> >>
> >>+    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
> >>+        error_setg(errp,
> >>+                   "Attempt to use second window when DDW is disabled on PHB");
> >>+        return;
> >>+    }
> >
> >This should never happen unless something is wrong with the tests in
> >the RTAS functions, yes?  In which case it should probably be an
> >assert().
> 
> This should not. But this is called from the RTAS caller so I'd really like
> to have a message rather than assert() if that condition happens, here or in
> rtas_ibm_create_pe_dma_window().

It should only be called from RTAS if ddw is enabled though, yes?

> 
> 
> >
> >>      spapr_tce_table_enable(tcet, page_shift, window_addr, nb_table);
> >>  }
> >>
> >>-static int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
> >>+int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
> >>  {
> >>      sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> >>
> >>@@ -1492,14 +1498,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
> >>      }
> >>
> >>      /* DMA setup */
> >>-    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
> >>-    if (!tcet) {
> >>-        error_report("No default TCE table for %s", sphb->dtbusname);
> >>-        return;
> >>-    }
> >>
> >>-    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> >>-                                        spapr_tce_get_iommu(tcet), 0);
> >>+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> >>+        tcet = spapr_tce_new_table(DEVICE(sphb),
> >>+                                   SPAPR_PCI_LIOBN(sphb->index, i));
> >>+        if (!tcet) {
> >>+            error_setg(errp, "Creating window#%d failed for %s",
> >>+                       i, sphb->dtbusname);
> >>+            return;
> >>+        }
> >>+        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> >>+                                            spapr_tce_get_iommu(tcet), 0);
> >>+    }
> >>
> >>      sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
> >>  }
> >>@@ -1517,11 +1527,16 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
> >>
> >>  void spapr_phb_dma_reset(sPAPRPHBState *sphb)
> >>  {
> >>-    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
> >>      Error *local_err = NULL;
> >>+    int i;
> >>
> >>-    if (tcet && tcet->enabled) {
> >>-        spapr_phb_dma_window_disable(sphb, sphb->dma_liobn);
> >>+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> >>+        uint32_t liobn = SPAPR_PCI_LIOBN(sphb->index, i);
> >>+        sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> >>+
> >>+        if (tcet && tcet->enabled) {
> >>+            spapr_phb_dma_window_disable(sphb, liobn);
> >>+        }
> >>      }
> >>
> >>      /* Register default 32bit DMA window */
> >>@@ -1562,6 +1577,13 @@ static Property spapr_phb_properties[] = {
> >>      /* Default DMA window is 0..1GB */
> >>      DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
> >>      DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
> >>+    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
> >>+                       0x800000000000000ULL),
> >>+    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> >>+    DEFINE_PROP_UINT32("windows", sPAPRPHBState, windows_supported,
> >>+                       SPAPR_PCI_DMA_MAX_WINDOWS),
> >
> >What will happen if the user tries to set 'windows' larger than
> >SPAPR_PCI_DMA_MAX_WINDOWS?
> 
> 
> Oh. I need to replace SPAPR_PCI_DMA_MAX_WINDOWS with windows_supported
> everywhere, missed that. Besides that, there will be support for more
> windows, that's it. The host VFIO IOMMU driver will fail creating more
> windows but this is expected. For emulated windows, there will be more
> windows with no other consequences.

Hmm.. is there actually a reason to have the windows property?  Would
you be better off just using the compile time constant for now.

> 
> 
> 
> 
> >>+    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
> >>+                       (1ULL << 12) | (1ULL << 16) | (1ULL << 24)),
> >>      DEFINE_PROP_END_OF_LIST(),
> >>  };
> >>
> >>@@ -1815,6 +1837,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>      uint32_t interrupt_map_mask[] = {
> >>          cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
> >>      uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
> >>+    uint32_t ddw_applicable[] = {
> >>+        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
> >>+        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
> >>+        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
> >>+    };
> >>+    uint32_t ddw_extensions[] = {
> >>+        cpu_to_be32(1),
> >>+        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> >>+    };
> >>      sPAPRTCETable *tcet;
> >>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
> >>      sPAPRFDT s_fdt;
> >>@@ -1839,6 +1870,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
> >>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
> >>
> >>+    /* Dynamic DMA window */
> >>+    if (phb->ddw_enabled) {
> >>+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
> >>+                         sizeof(ddw_applicable)));
> >>+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
> >>+                         &ddw_extensions, sizeof(ddw_extensions)));
> >>+    }
> >>+
> >>      /* Build the interrupt-map, this must matches what is done
> >>       * in pci_spapr_map_irq
> >>       */
> >>diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
> >>new file mode 100644
> >>index 0000000..37f805f
> >>--- /dev/null
> >>+++ b/hw/ppc/spapr_rtas_ddw.c
> >>@@ -0,0 +1,300 @@
> >>+/*
> >>+ * QEMU sPAPR Dynamic DMA windows support
> >>+ *
> >>+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
> >>+ *
> >>+ *  This program is free software; you can redistribute it and/or modify
> >>+ *  it under the terms of the GNU General Public License as published by
> >>+ *  the Free Software Foundation; either version 2 of the License,
> >>+ *  or (at your option) any later version.
> >>+ *
> >>+ *  This program is distributed in the hope that it will be useful,
> >>+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>+ *  GNU General Public License for more details.
> >>+ *
> >>+ *  You should have received a copy of the GNU General Public License
> >>+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
> >>+ */
> >>+
> >>+#include "qemu/osdep.h"
> >>+#include "qemu/error-report.h"
> >>+#include "hw/ppc/spapr.h"
> >>+#include "hw/pci-host/spapr.h"
> >>+#include "trace.h"
> >>+
> >>+static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
> >>+{
> >>+    sPAPRTCETable *tcet;
> >>+
> >>+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
> >>+    if (tcet && tcet->enabled) {
> >>+        ++*(unsigned *)opaque;
> >>+    }
> >>+    return 0;
> >>+}
> >>+
> >>+static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
> >>+{
> >>+    unsigned ret = 0;
> >>+
> >>+    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
> >>+
> >>+    return ret;
> >>+}
> >>+
> >>+static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
> >>+{
> >>+    sPAPRTCETable *tcet;
> >>+
> >>+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
> >>+    if (tcet && !tcet->enabled) {
> >>+        *(uint32_t *)opaque = tcet->liobn;
> >>+        return 1;
> >>+    }
> >>+    return 0;
> >>+}
> >>+
> >>+static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
> >>+{
> >>+    uint32_t liobn = 0;
> >>+
> >>+    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
> >>+
> >>+    return liobn;
> >>+}
> >>+
> >>+static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
> >>+                                 uint64_t page_mask)
> >>+{
> >>+    int i, j;
> >>+    uint32_t mask = 0;
> >>+    const struct { int shift; uint32_t mask; } masks[] = {
> >>+        { 12, RTAS_DDW_PGSIZE_4K },
> >>+        { 16, RTAS_DDW_PGSIZE_64K },
> >>+        { 24, RTAS_DDW_PGSIZE_16M },
> >>+        { 25, RTAS_DDW_PGSIZE_32M },
> >>+        { 26, RTAS_DDW_PGSIZE_64M },
> >>+        { 27, RTAS_DDW_PGSIZE_128M },
> >>+        { 28, RTAS_DDW_PGSIZE_256M },
> >>+        { 34, RTAS_DDW_PGSIZE_16G },
> >>+    };
> >>+
> >>+    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
> >>+        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
> >>+            if ((sps[i].page_shift == masks[j].shift) &&
> >>+                    (page_mask & (1ULL << masks[j].shift))) {
> >>+                mask |= masks[j].mask;
> >>+            }
> >>+        }
> >>+    }
> >>+
> >>+    return mask;
> >>+}
> >>+
> >>+static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
> >>+                                         sPAPRMachineState *spapr,
> >>+                                         uint32_t token, uint32_t nargs,
> >>+                                         target_ulong args,
> >>+                                         uint32_t nret, target_ulong rets)
> >>+{
> >>+    CPUPPCState *env = &cpu->env;
> >>+    sPAPRPHBState *sphb;
> >>+    uint64_t buid, max_window_size;
> >>+    uint32_t avail, addr, pgmask = 0;
> >>+
> >>+    if ((nargs != 3) || (nret != 5)) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> >>+    addr = rtas_ld(args, 0);
> >>+    sphb = spapr_pci_find_phb(spapr, buid);
> >>+    if (!sphb || !sphb->ddw_enabled) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    /* Work out supported page masks */
> >>+    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);
> >
> >There are a few potential problems here.  First you're just
> >arbitrarily picking the first entry in the sps array to filter
> 
> Why first?  spapr_query_mask() has a loop 0..PPC_PAGE_SIZES_MAX_SZ.

env->sps is a nested array, 0..PPC_PAGE_SIZES_MAX_SZ-1 for base page
sizes, then again for actual page sizes.  You're only examing the
first "row" of that table.  It kinda works because the 4k base page
size is first, which lists all the actual page size options.

> >against, which doesn't seem right (except by accident).  It's a little
> >bit odd filtering against guest page sizes at all, although I get what
> >you're really trying to do is filter against allowed host page sizes.
> >
> >The other problem is that silently filtering capabilities based on the
> >host can be a pain for migration - I've made the mistake and had it
> >bite me in the past.  I think it would be safer to just check the
> >pagesizes requested in the property against what's possible and
> >outright fail if they don't match.  For convenience you could also set
> >according to host capabilities if the user doesn't specify anything,
> >but that would require explicit handling of the "default" case.
> 
> 
> For the migration purposes, both guests should be started with or without
> hugepages enabled; this is taken into account already. Besides that, the
> result of "query" won't differ.

Hmm.. if you're migrating between TCG and KVM or between PR and HV
these could change as well.  I'm not sure that works at the moment,
but I'd prefer not to introduce any more barriers to it than we have
to.

> >Remember that this code will be relevant for DDW with emulated
> >devices, even if VFIO is not in play at all.
> >
> >All those considerations aside, it seems like it would make more sense
> >to do this filtering during device realize, rather than leaving it
> >until the guest queries.
> 
> The result will be the same, it only depends on whether hugepages are
> enabled or not and this happens at the start time. But yes, feels more
> accurate to do this in PHB realize(), I'll move it.
> 
> 
> >
> >>+    /*
> >>+     * This is "Largest contiguous block of TCEs allocated specifically
> >>+     * for (that is, are reserved for) this PE".
> >>+     * Return the maximum number as maximum supported RAM size was in 4K pages.
> >>+     */
> >>+    max_window_size = MACHINE(spapr)->maxram_size >> SPAPR_TCE_PAGE_SHIFT;
> >
> >Will maxram_size always be enough?  There will sometimes be an
> >alignment gap between the "base" RAM and the hotpluggable RAM, meaning
> >that if everything is plugged the last RAM address will be beyond
> >maxram_size.  Will that require pushing this number up, or will the
> >guest "repack" the RAM layout when it maps it into the TCE tables?
> 
> 
> Hm. I do not know what the guest does to DDW on memory hotplug but this is a
> valid point... What QEMU helper does return the last available address in
> the system memory address space? Like memblock_end_of_DRAM() in the kernel,
> I would use that instead.

There is a last_ram_offset() but that's in the ram_addr_t address
space, which isn't necessarily the same as the physical address space
(though it's usually similar).  You can have a look at what we check
in (the TCG/PR version of) H_ENTER which needs to check this as well.

> 
> 
> >>+    avail = sphb->windows_supported - spapr_phb_get_active_win_num(sphb);
> >>+
> >>+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >>+    rtas_st(rets, 1, avail);
> >>+    rtas_st(rets, 2, max_window_size);
> >>+    rtas_st(rets, 3, pgmask);
> >>+    rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
> >>+
> >>+    trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
> >>+    return;
> >>+
> >>+param_error_exit:
> >>+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> >>+}
> >>+
> >>+static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
> >>+                                          sPAPRMachineState *spapr,
> >>+                                          uint32_t token, uint32_t nargs,
> >>+                                          target_ulong args,
> >>+                                          uint32_t nret, target_ulong rets)
> >>+{
> >>+    sPAPRPHBState *sphb;
> >>+    sPAPRTCETable *tcet = NULL;
> >>+    uint32_t addr, page_shift, window_shift, liobn;
> >>+    uint64_t buid;
> >>+    Error *local_err = NULL;
> >>+
> >>+    if ((nargs != 5) || (nret != 4)) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> >>+    addr = rtas_ld(args, 0);
> >>+    sphb = spapr_pci_find_phb(spapr, buid);
> >>+    if (!sphb || !sphb->ddw_enabled) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    page_shift = rtas_ld(args, 3);
> >>+    window_shift = rtas_ld(args, 4);
> >>+    liobn = spapr_phb_get_free_liobn(sphb);
> >>+
> >>+    if (!liobn || !(sphb->page_size_mask & (1ULL << page_shift)) ||
> >>+        spapr_phb_get_active_win_num(sphb) == sphb->windows_supported) {
> >>+        goto hw_error_exit;
> >>+    }
> >
> >Bad page sizes should be H_PARAM, not H_HARDWARE, no?
> 
> Correct, I'll fix it.
> 
> >
> >Also is no available liobns H_HARDWARE, or H_RESOURCE?
> 
> LoPAPR says this should return -1 (hardware), -2 (function), -3 (privilege),
> not -16. So I pick -1 for this.

Ok.

> 
> 
> >>+
> >>+    if (window_shift < page_shift) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    spapr_phb_dma_window_enable(sphb, liobn, page_shift,
> >>+                                sphb->dma64_window_addr,
> >>+                                1ULL << window_shift, &local_err);
> >>+    if (local_err) {
> >>+        error_report_err(local_err);
> >>+        goto hw_error_exit;
> >>+    }
> >>+
> >>+    tcet = spapr_tce_find_by_liobn(liobn);
> >>+    trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift,
> >>+                                 1ULL << window_shift,
> >>+                                 tcet ? tcet->bus_offset : 0xbaadf00d, liobn);
> >>+    if (local_err || !tcet) {
> >>+        goto hw_error_exit;
> >>+    }
> >>+
> >>+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >>+    rtas_st(rets, 1, liobn);
> >>+    rtas_st(rets, 2, tcet->bus_offset >> 32);
> >>+    rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1));
> >>+
> >>+    return;
> >>+
> >>+hw_error_exit:
> >>+    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
> >>+    return;
> >>+
> >>+param_error_exit:
> >>+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> >>+}
> >>+
> >>+static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu,
> >>+                                          sPAPRMachineState *spapr,
> >>+                                          uint32_t token, uint32_t nargs,
> >>+                                          target_ulong args,
> >>+                                          uint32_t nret, target_ulong rets)
> >>+{
> >>+    sPAPRPHBState *sphb;
> >>+    sPAPRTCETable *tcet;
> >>+    uint32_t liobn;
> >>+    long ret;
> >>+
> >>+    if ((nargs != 1) || (nret != 1)) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    liobn = rtas_ld(args, 0);
> >>+    tcet = spapr_tce_find_by_liobn(liobn);
> >>+    if (!tcet) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent);
> >>+    if (!sphb || !sphb->ddw_enabled || !spapr_phb_get_active_win_num(sphb)) {
> >
> >Checking spapr_phb_get_active_win_num() seems weird.  You already know
> >that this tcet is a child of the sphb.  Can't you just check its own
> >enabled property.
> >
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    ret = spapr_phb_dma_window_disable(sphb, liobn);
> >>+    trace_spapr_iommu_ddw_remove(liobn, ret);
> >>+    if (ret) {
> >>+        goto hw_error_exit;
> >>+    }
> >>+
> >>+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >>+    return;
> >>+
> >>+hw_error_exit:
> >>+    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
> >>+    return;
> >>+
> >>+param_error_exit:
> >>+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> >>+}
> >>+
> >>+static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu,
> >>+                                         sPAPRMachineState *spapr,
> >>+                                         uint32_t token, uint32_t nargs,
> >>+                                         target_ulong args,
> >>+                                         uint32_t nret, target_ulong rets)
> >>+{
> >>+    sPAPRPHBState *sphb;
> >>+    uint64_t buid;
> >>+    uint32_t addr;
> >>+
> >>+    if ((nargs != 3) || (nret != 1)) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> >>+    addr = rtas_ld(args, 0);
> >>+    sphb = spapr_pci_find_phb(spapr, buid);
> >>+    if (!sphb || !sphb->ddw_enabled) {
> >>+        goto param_error_exit;
> >>+    }
> >>+
> >>+    spapr_phb_dma_reset(sphb);
> >>+    trace_spapr_iommu_ddw_reset(buid, addr);
> >>+
> >>+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >>+
> >>+    return;
> >>+
> >>+param_error_exit:
> >>+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> >>+}
> >>+
> >>+static void spapr_rtas_ddw_init(void)
> >>+{
> >>+    spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
> >>+                        "ibm,query-pe-dma-window",
> >>+                        rtas_ibm_query_pe_dma_window);
> >>+    spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
> >>+                        "ibm,create-pe-dma-window",
> >>+                        rtas_ibm_create_pe_dma_window);
> >>+    spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
> >>+                        "ibm,remove-pe-dma-window",
> >>+                        rtas_ibm_remove_pe_dma_window);
> >>+    spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
> >>+                        "ibm,reset-pe-dma-window",
> >>+                        rtas_ibm_reset_pe_dma_window);
> >>+}
> >>+
> >>+type_init(spapr_rtas_ddw_init)
> >>diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >>index 421d6eb..b0ea146 100644
> >>--- a/hw/vfio/common.c
> >>+++ b/hw/vfio/common.c
> >>@@ -994,11 +994,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
> >>              }
> >>          }
> >>
> >>-        /*
> >>-         * This only considers the host IOMMU's 32-bit window.  At
> >>-         * some point we need to add support for the optional 64-bit
> >>-         * window and dynamic windows
> >>-         */
> >
> >This seems like a stray change to VFIO code in a patch that's
> >otherwise only affecting spapr_pci.  Does this hunk belong in a
> >different patch?
> 
> 
> It does :)
> 
> 
>
Alexey Kardashevskiy March 24, 2016, 2:32 a.m. UTC | #4
On 03/23/2016 05:11 PM, David Gibson wrote:
> On Wed, Mar 23, 2016 at 02:28:01PM +1100, Alexey Kardashevskiy wrote:
>> On 03/23/2016 01:13 PM, David Gibson wrote:
>>> On Mon, Mar 21, 2016 at 06:47:06PM +1100, Alexey Kardashevskiy wrote:
>>>> This adds support for Dynamic DMA Windows (DDW) option defined by
>>>> the SPAPR specification which allows to have additional DMA window(s)
>>>>
>>>> This implements DDW for emulated and VFIO devices.
>>>> This reserves RTAS token numbers for DDW calls.
>>>>
>>>> This changes the TCE table migration descriptor to support dynamic
>>>> tables as from now on, PHB will create as many stub TCE table objects
>>>> as PHB can possibly support but not all of them might be initialized at
>>>> the time of migration because DDW might or might not be requested by
>>>> the guest.
>>>>
>>>> The "ddw" property is enabled by default on a PHB but for compatibility
>>>> the pseries-2.5 machine and older disable it.
>>>>
>>>> This implements DDW for VFIO. The host kernel support is required.
>>>> This adds a "levels" property to PHB to control the number of levels
>>>> in the actual TCE table allocated by the host kernel, 0 is the default
>>>> value to tell QEMU to calculate the correct value. Current hardware
>>>> supports up to 5 levels.
>>>>
>>>> The existing linux guests try creating one additional huge DMA window
>>>> with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
>>>> the guest switches to dma_direct_ops and never calls TCE hypercalls
>>>> (H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
>>>> and not waste time on map/unmap later. This adds a "dma64_win_addr"
>>>> property which is a bus address for the 64bit window and by default
>>>> set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
>>>> uses and this allows having emulated and VFIO devices on the same bus.
>>>>
>>>> This adds 4 RTAS handlers:
>>>> * ibm,query-pe-dma-window
>>>> * ibm,create-pe-dma-window
>>>> * ibm,remove-pe-dma-window
>>>> * ibm,reset-pe-dma-window
>>>> These are registered from type_init() callback.
>>>>
>>>> These RTAS handlers are implemented in a separate file to avoid polluting
>>>> spapr_iommu.c with PCI.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>   hw/ppc/Makefile.objs        |   1 +
>>>>   hw/ppc/spapr.c              |   7 +-
>>>>   hw/ppc/spapr_pci.c          |  73 ++++++++---
>>>>   hw/ppc/spapr_rtas_ddw.c     | 300 ++++++++++++++++++++++++++++++++++++++++++++
>>>>   hw/vfio/common.c            |   5 -
>>>>   include/hw/pci-host/spapr.h |  13 ++
>>>>   include/hw/ppc/spapr.h      |  16 ++-
>>>>   trace-events                |   4 +
>>>>   8 files changed, 395 insertions(+), 24 deletions(-)
>>>>   create mode 100644 hw/ppc/spapr_rtas_ddw.c
>>>>
>>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>>>> index c1ffc77..986b36f 100644
>>>> --- a/hw/ppc/Makefile.objs
>>>> +++ b/hw/ppc/Makefile.objs
>>>> @@ -7,6 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>>>>   ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>>>>   obj-y += spapr_pci_vfio.o
>>>>   endif
>>>> +obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
>>>>   # PowerPC 4xx boards
>>>>   obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
>>>>   obj-y += ppc4xx_pci.o
>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>> index d0bb423..ef4c637 100644
>>>> --- a/hw/ppc/spapr.c
>>>> +++ b/hw/ppc/spapr.c
>>>> @@ -2362,7 +2362,12 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
>>>>    * pseries-2.5
>>>>    */
>>>>   #define SPAPR_COMPAT_2_5 \
>>>> -        HW_COMPAT_2_5
>>>> +        HW_COMPAT_2_5 \
>>>> +        {\
>>>> +            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
>>>> +            .property = "ddw",\
>>>> +            .value    = stringify(off),\
>>>> +        },
>>>>
>>>>   static void spapr_machine_2_5_instance_options(MachineState *machine)
>>>>   {
>>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>>>> index af99a36..3bb294a 100644
>>>> --- a/hw/ppc/spapr_pci.c
>>>> +++ b/hw/ppc/spapr_pci.c
>>>> @@ -803,12 +803,12 @@ static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
>>>>       return buf;
>>>>   }
>>>>
>>>> -static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>>>> -                                       uint32_t liobn,
>>>> -                                       uint32_t page_shift,
>>>> -                                       uint64_t window_addr,
>>>> -                                       uint64_t window_size,
>>>> -                                       Error **errp)
>>>> +void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>>>> +                                 uint32_t liobn,
>>>> +                                 uint32_t page_shift,
>>>> +                                 uint64_t window_addr,
>>>> +                                 uint64_t window_size,
>>>> +                                 Error **errp)
>>>>   {
>>>>       sPAPRTCETable *tcet;
>>>>       uint32_t nb_table = window_size >> page_shift;
>>>> @@ -825,10 +825,16 @@ static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>>>>           return;
>>>>       }
>>>>
>>>> +    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
>>>> +        error_setg(errp,
>>>> +                   "Attempt to use second window when DDW is disabled on PHB");
>>>> +        return;
>>>> +    }
>>>
>>> This should never happen unless something is wrong with the tests in
>>> the RTAS functions, yes?  In which case it should probably be an
>>> assert().
>>
>> This should not. But this is called from the RTAS caller so I'd really like
>> to have a message rather than assert() if that condition happens, here or in
>> rtas_ibm_create_pe_dma_window().
>
> It should only be called from RTAS if ddw is enabled though, yes?


 From RTAS and from the PHB reset handler. Well. I will get rid of 
spapr_phb_dma_window_enable/spapr_phb_dma_window_disable, they are quite 
useless when I look at them now.


>>
>>>
>>>>       spapr_tce_table_enable(tcet, page_shift, window_addr, nb_table);
>>>>   }
>>>>
>>>> -static int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
>>>> +int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
>>>>   {
>>>>       sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>>>>
>>>> @@ -1492,14 +1498,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>>>>       }
>>>>
>>>>       /* DMA setup */
>>>> -    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
>>>> -    if (!tcet) {
>>>> -        error_report("No default TCE table for %s", sphb->dtbusname);
>>>> -        return;
>>>> -    }
>>>>
>>>> -    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
>>>> -                                        spapr_tce_get_iommu(tcet), 0);
>>>> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
>>>> +        tcet = spapr_tce_new_table(DEVICE(sphb),
>>>> +                                   SPAPR_PCI_LIOBN(sphb->index, i));
>>>> +        if (!tcet) {
>>>> +            error_setg(errp, "Creating window#%d failed for %s",
>>>> +                       i, sphb->dtbusname);
>>>> +            return;
>>>> +        }
>>>> +        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
>>>> +                                            spapr_tce_get_iommu(tcet), 0);
>>>> +    }
>>>>
>>>>       sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
>>>>   }
>>>> @@ -1517,11 +1527,16 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
>>>>
>>>>   void spapr_phb_dma_reset(sPAPRPHBState *sphb)
>>>>   {
>>>> -    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
>>>>       Error *local_err = NULL;
>>>> +    int i;
>>>>
>>>> -    if (tcet && tcet->enabled) {
>>>> -        spapr_phb_dma_window_disable(sphb, sphb->dma_liobn);
>>>> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
>>>> +        uint32_t liobn = SPAPR_PCI_LIOBN(sphb->index, i);
>>>> +        sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>>>> +
>>>> +        if (tcet && tcet->enabled) {
>>>> +            spapr_phb_dma_window_disable(sphb, liobn);
>>>> +        }
>>>>       }
>>>>
>>>>       /* Register default 32bit DMA window */
>>>> @@ -1562,6 +1577,13 @@ static Property spapr_phb_properties[] = {
>>>>       /* Default DMA window is 0..1GB */
>>>>       DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
>>>>       DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
>>>> +    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
>>>> +                       0x800000000000000ULL),
>>>> +    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
>>>> +    DEFINE_PROP_UINT32("windows", sPAPRPHBState, windows_supported,
>>>> +                       SPAPR_PCI_DMA_MAX_WINDOWS),
>>>
>>> What will happen if the user tries to set 'windows' larger than
>>> SPAPR_PCI_DMA_MAX_WINDOWS?
>>
>>
>> Oh. I need to replace SPAPR_PCI_DMA_MAX_WINDOWS with windows_supported
>> everywhere, missed that. Besides that, there will be support for more
>> windows, that's it. The host VFIO IOMMU driver will fail creating more
>> windows but this is expected. For emulated windows, there will be more
>> windows with no other consequences.
>
> Hmm.. is there actually a reason to have the windows property?  Would
> you be better off just using the compile time constant for now.


I am afraid it is going to be 2 DMA windows forever as the other DMA 
tlb-ish facility coming does not use windows at all :)


>>
>>
>>
>>
>>>> +    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
>>>> +                       (1ULL << 12) | (1ULL << 16) | (1ULL << 24)),
>>>>       DEFINE_PROP_END_OF_LIST(),
>>>>   };
>>>>
>>>> @@ -1815,6 +1837,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>>>       uint32_t interrupt_map_mask[] = {
>>>>           cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
>>>>       uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
>>>> +    uint32_t ddw_applicable[] = {
>>>> +        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
>>>> +        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
>>>> +        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
>>>> +    };
>>>> +    uint32_t ddw_extensions[] = {
>>>> +        cpu_to_be32(1),
>>>> +        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
>>>> +    };
>>>>       sPAPRTCETable *tcet;
>>>>       PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
>>>>       sPAPRFDT s_fdt;
>>>> @@ -1839,6 +1870,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>>>       _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
>>>>       _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
>>>>
>>>> +    /* Dynamic DMA window */
>>>> +    if (phb->ddw_enabled) {
>>>> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
>>>> +                         sizeof(ddw_applicable)));
>>>> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
>>>> +                         &ddw_extensions, sizeof(ddw_extensions)));
>>>> +    }
>>>> +
>>>>       /* Build the interrupt-map, this must matches what is done
>>>>        * in pci_spapr_map_irq
>>>>        */
>>>> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
>>>> new file mode 100644
>>>> index 0000000..37f805f
>>>> --- /dev/null
>>>> +++ b/hw/ppc/spapr_rtas_ddw.c
>>>> @@ -0,0 +1,300 @@
>>>> +/*
>>>> + * QEMU sPAPR Dynamic DMA windows support
>>>> + *
>>>> + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
>>>> + *
>>>> + *  This program is free software; you can redistribute it and/or modify
>>>> + *  it under the terms of the GNU General Public License as published by
>>>> + *  the Free Software Foundation; either version 2 of the License,
>>>> + *  or (at your option) any later version.
>>>> + *
>>>> + *  This program is distributed in the hope that it will be useful,
>>>> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>> + *  GNU General Public License for more details.
>>>> + *
>>>> + *  You should have received a copy of the GNU General Public License
>>>> + *  along with this program; if not, see <http://www.gnu.org/licenses/>.
>>>> + */
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qemu/error-report.h"
>>>> +#include "hw/ppc/spapr.h"
>>>> +#include "hw/pci-host/spapr.h"
>>>> +#include "trace.h"
>>>> +
>>>> +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
>>>> +{
>>>> +    sPAPRTCETable *tcet;
>>>> +
>>>> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
>>>> +    if (tcet && tcet->enabled) {
>>>> +        ++*(unsigned *)opaque;
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
>>>> +{
>>>> +    unsigned ret = 0;
>>>> +
>>>> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
>>>> +{
>>>> +    sPAPRTCETable *tcet;
>>>> +
>>>> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
>>>> +    if (tcet && !tcet->enabled) {
>>>> +        *(uint32_t *)opaque = tcet->liobn;
>>>> +        return 1;
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
>>>> +{
>>>> +    uint32_t liobn = 0;
>>>> +
>>>> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
>>>> +
>>>> +    return liobn;
>>>> +}
>>>> +
>>>> +static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
>>>> +                                 uint64_t page_mask)
>>>> +{
>>>> +    int i, j;
>>>> +    uint32_t mask = 0;
>>>> +    const struct { int shift; uint32_t mask; } masks[] = {
>>>> +        { 12, RTAS_DDW_PGSIZE_4K },
>>>> +        { 16, RTAS_DDW_PGSIZE_64K },
>>>> +        { 24, RTAS_DDW_PGSIZE_16M },
>>>> +        { 25, RTAS_DDW_PGSIZE_32M },
>>>> +        { 26, RTAS_DDW_PGSIZE_64M },
>>>> +        { 27, RTAS_DDW_PGSIZE_128M },
>>>> +        { 28, RTAS_DDW_PGSIZE_256M },
>>>> +        { 34, RTAS_DDW_PGSIZE_16G },
>>>> +    };
>>>> +
>>>> +    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
>>>> +        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
>>>> +            if ((sps[i].page_shift == masks[j].shift) &&
>>>> +                    (page_mask & (1ULL << masks[j].shift))) {
>>>> +                mask |= masks[j].mask;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return mask;
>>>> +}
>>>> +
>>>> +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
>>>> +                                         sPAPRMachineState *spapr,
>>>> +                                         uint32_t token, uint32_t nargs,
>>>> +                                         target_ulong args,
>>>> +                                         uint32_t nret, target_ulong rets)
>>>> +{
>>>> +    CPUPPCState *env = &cpu->env;
>>>> +    sPAPRPHBState *sphb;
>>>> +    uint64_t buid, max_window_size;
>>>> +    uint32_t avail, addr, pgmask = 0;
>>>> +
>>>> +    if ((nargs != 3) || (nret != 5)) {
>>>> +        goto param_error_exit;
>>>> +    }
>>>> +
>>>> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
>>>> +    addr = rtas_ld(args, 0);
>>>> +    sphb = spapr_pci_find_phb(spapr, buid);
>>>> +    if (!sphb || !sphb->ddw_enabled) {
>>>> +        goto param_error_exit;
>>>> +    }
>>>> +
>>>> +    /* Work out supported page masks */
>>>> +    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);
>>>
>>> There are a few potential problems here.  First you're just
>>> arbitrarily picking the first entry in the sps array to filter
>>
>> Why first?  spapr_query_mask() has a loop 0..PPC_PAGE_SIZES_MAX_SZ.
>
> env->sps is a nested array, 0..PPC_PAGE_SIZES_MAX_SZ-1 for base page
> sizes, then again for actual page sizes.  You're only examing the
> first "row" of that table.  It kinda works because the 4k base page
> size is first, which lists all the actual page size options.

Ah. Right. So I need to walk through all of them, ok.


>>> against, which doesn't seem right (except by accident).  It's a little
>>> bit odd filtering against guest page sizes at all, although I get what
>>> you're really trying to do is filter against allowed host page sizes.
>>>
>>> The other problem is that silently filtering capabilities based on the
>>> host can be a pain for migration - I've made the mistake and had it
>>> bite me in the past.  I think it would be safer to just check the
>>> pagesizes requested in the property against what's possible and
>>> outright fail if they don't match.  For convenience you could also set
>>> according to host capabilities if the user doesn't specify anything,
>>> but that would require explicit handling of the "default" case.


What are the host capabilities here?

There is a page mask from the host IOMMU/PE which is 4K|64K|16M and many 
other sizes, this is supported always by IODA2.
And there is PAGE_SIZE and huge pages (but only with -mempath) - so, 64K 
and 16M (with -mempath).

And there is a "ddw-query" RTAS call which tells the guest if it can use 
16M or not. How do you suggest I advertise 16M to the guest? If I always 
advertise 16M and there is no -mempath, the guest won't try smaller page size.

So - if the user wants 16M IOMMU pages, he has to use -mempath and in 
addition to that explicitely say -global spapr-pci-host-bridge.pgsz=16M, 
and by default enable only 4K and 64K (or just 4K?)? I am fine with this, 
it just means more work for libvirt folks.



>>
>> For the migration purposes, both guests should be started with or without
>> hugepages enabled; this is taken into account already. Besides that, the
>> result of "query" won't differ.
>
> Hmm.. if you're migrating between TCG and KVM or between PR and HV
> these could change as well.  I'm not sure that works at the moment,
> but I'd prefer not to introduce any more barriers to it than we have
> to.
>
>>> Remember that this code will be relevant for DDW with emulated
>>> devices, even if VFIO is not in play at all.
>>>
>>> All those considerations aside, it seems like it would make more sense
>>> to do this filtering during device realize, rather than leaving it
>>> until the guest queries.
>>
>> The result will be the same, it only depends on whether hugepages are
>> enabled or not and this happens at the start time. But yes, feels more
>> accurate to do this in PHB realize(), I'll move it.
>>
>>
>>>
>>>> +    /*
>>>> +     * This is "Largest contiguous block of TCEs allocated specifically
>>>> +     * for (that is, are reserved for) this PE".
>>>> +     * Return the maximum number as maximum supported RAM size was in 4K pages.
>>>> +     */
>>>> +    max_window_size = MACHINE(spapr)->maxram_size >> SPAPR_TCE_PAGE_SHIFT;
>>>
>>> Will maxram_size always be enough?  There will sometimes be an
>>> alignment gap between the "base" RAM and the hotpluggable RAM, meaning
>>> that if everything is plugged the last RAM address will be beyond
>>> maxram_size.  Will that require pushing this number up, or will the
>>> guest "repack" the RAM layout when it maps it into the TCE tables?
>>
>>
>> Hm. I do not know what the guest does to DDW on memory hotplug but this is a
>> valid point... What QEMU helper does return the last available address in
>> the system memory address space? Like memblock_end_of_DRAM() in the kernel,
>> I would use that instead.
>
> There is a last_ram_offset() but that's in the ram_addr_t address

What do you call "ram_addr_t address space"? Non-pluggable memory and 
machine->ram_size is its size?


> space, which isn't necessarily the same as the physical address space
> (though it's usually similar).


I looked at the code and it looks like machine->ram_size is always what 
came from "-m" and it won't grow if some memory was hotplugged, is that 
correct?

And hotpluggable memory does not appear in the global ram_list as a RAMBlock?


> You can have a look at what we check
> in (the TCG/PR version of) H_ENTER which needs to check this as well.

Ok, thanks for the pointer.
David Gibson March 29, 2016, 5:22 a.m. UTC | #5
On Thu, Mar 24, 2016 at 01:32:48PM +1100, Alexey Kardashevskiy wrote:
> On 03/23/2016 05:11 PM, David Gibson wrote:
> >On Wed, Mar 23, 2016 at 02:28:01PM +1100, Alexey Kardashevskiy wrote:
> >>On 03/23/2016 01:13 PM, David Gibson wrote:
> >>>On Mon, Mar 21, 2016 at 06:47:06PM +1100, Alexey Kardashevskiy wrote:
> >>>>This adds support for Dynamic DMA Windows (DDW) option defined by
> >>>>the SPAPR specification which allows to have additional DMA window(s)
> >>>>
> >>>>This implements DDW for emulated and VFIO devices.
> >>>>This reserves RTAS token numbers for DDW calls.
> >>>>
> >>>>This changes the TCE table migration descriptor to support dynamic
> >>>>tables as from now on, PHB will create as many stub TCE table objects
> >>>>as PHB can possibly support but not all of them might be initialized at
> >>>>the time of migration because DDW might or might not be requested by
> >>>>the guest.
> >>>>
> >>>>The "ddw" property is enabled by default on a PHB but for compatibility
> >>>>the pseries-2.5 machine and older disable it.
> >>>>
> >>>>This implements DDW for VFIO. The host kernel support is required.
> >>>>This adds a "levels" property to PHB to control the number of levels
> >>>>in the actual TCE table allocated by the host kernel, 0 is the default
> >>>>value to tell QEMU to calculate the correct value. Current hardware
> >>>>supports up to 5 levels.
> >>>>
> >>>>The existing linux guests try creating one additional huge DMA window
> >>>>with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
> >>>>the guest switches to dma_direct_ops and never calls TCE hypercalls
> >>>>(H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
> >>>>and not waste time on map/unmap later. This adds a "dma64_win_addr"
> >>>>property which is a bus address for the 64bit window and by default
> >>>>set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
> >>>>uses and this allows having emulated and VFIO devices on the same bus.
> >>>>
> >>>>This adds 4 RTAS handlers:
> >>>>* ibm,query-pe-dma-window
> >>>>* ibm,create-pe-dma-window
> >>>>* ibm,remove-pe-dma-window
> >>>>* ibm,reset-pe-dma-window
> >>>>These are registered from type_init() callback.
> >>>>
> >>>>These RTAS handlers are implemented in a separate file to avoid polluting
> >>>>spapr_iommu.c with PCI.
> >>>>
> >>>>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>---
> >>>>  hw/ppc/Makefile.objs        |   1 +
> >>>>  hw/ppc/spapr.c              |   7 +-
> >>>>  hw/ppc/spapr_pci.c          |  73 ++++++++---
> >>>>  hw/ppc/spapr_rtas_ddw.c     | 300 ++++++++++++++++++++++++++++++++++++++++++++
> >>>>  hw/vfio/common.c            |   5 -
> >>>>  include/hw/pci-host/spapr.h |  13 ++
> >>>>  include/hw/ppc/spapr.h      |  16 ++-
> >>>>  trace-events                |   4 +
> >>>>  8 files changed, 395 insertions(+), 24 deletions(-)
> >>>>  create mode 100644 hw/ppc/spapr_rtas_ddw.c
> >>>>
> >>>>diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >>>>index c1ffc77..986b36f 100644
> >>>>--- a/hw/ppc/Makefile.objs
> >>>>+++ b/hw/ppc/Makefile.objs
> >>>>@@ -7,6 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> >>>>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
> >>>>  obj-y += spapr_pci_vfio.o
> >>>>  endif
> >>>>+obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
> >>>>  # PowerPC 4xx boards
> >>>>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
> >>>>  obj-y += ppc4xx_pci.o
> >>>>diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>>>index d0bb423..ef4c637 100644
> >>>>--- a/hw/ppc/spapr.c
> >>>>+++ b/hw/ppc/spapr.c
> >>>>@@ -2362,7 +2362,12 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
> >>>>   * pseries-2.5
> >>>>   */
> >>>>  #define SPAPR_COMPAT_2_5 \
> >>>>-        HW_COMPAT_2_5
> >>>>+        HW_COMPAT_2_5 \
> >>>>+        {\
> >>>>+            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
> >>>>+            .property = "ddw",\
> >>>>+            .value    = stringify(off),\
> >>>>+        },
> >>>>
> >>>>  static void spapr_machine_2_5_instance_options(MachineState *machine)
> >>>>  {
> >>>>diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> >>>>index af99a36..3bb294a 100644
> >>>>--- a/hw/ppc/spapr_pci.c
> >>>>+++ b/hw/ppc/spapr_pci.c
> >>>>@@ -803,12 +803,12 @@ static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
> >>>>      return buf;
> >>>>  }
> >>>>
> >>>>-static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>>>-                                       uint32_t liobn,
> >>>>-                                       uint32_t page_shift,
> >>>>-                                       uint64_t window_addr,
> >>>>-                                       uint64_t window_size,
> >>>>-                                       Error **errp)
> >>>>+void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>>>+                                 uint32_t liobn,
> >>>>+                                 uint32_t page_shift,
> >>>>+                                 uint64_t window_addr,
> >>>>+                                 uint64_t window_size,
> >>>>+                                 Error **errp)
> >>>>  {
> >>>>      sPAPRTCETable *tcet;
> >>>>      uint32_t nb_table = window_size >> page_shift;
> >>>>@@ -825,10 +825,16 @@ static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>>>          return;
> >>>>      }
> >>>>
> >>>>+    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
> >>>>+        error_setg(errp,
> >>>>+                   "Attempt to use second window when DDW is disabled on PHB");
> >>>>+        return;
> >>>>+    }
> >>>
> >>>This should never happen unless something is wrong with the tests in
> >>>the RTAS functions, yes?  In which case it should probably be an
> >>>assert().
> >>
> >>This should not. But this is called from the RTAS caller so I'd really like
> >>to have a message rather than assert() if that condition happens, here or in
> >>rtas_ibm_create_pe_dma_window().
> >
> >It should only be called from RTAS if ddw is enabled though, yes?
> 
> 
> From RTAS and from the PHB reset handler. Well. I will get rid of
> spapr_phb_dma_window_enable/spapr_phb_dma_window_disable, they are quite
> useless when I look at them now.

Ok.

> >>>>      spapr_tce_table_enable(tcet, page_shift, window_addr, nb_table);
> >>>>  }
> >>>>
> >>>>-static int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
> >>>>+int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
> >>>>  {
> >>>>      sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> >>>>
> >>>>@@ -1492,14 +1498,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
> >>>>      }
> >>>>
> >>>>      /* DMA setup */
> >>>>-    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
> >>>>-    if (!tcet) {
> >>>>-        error_report("No default TCE table for %s", sphb->dtbusname);
> >>>>-        return;
> >>>>-    }
> >>>>
> >>>>-    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> >>>>-                                        spapr_tce_get_iommu(tcet), 0);
> >>>>+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> >>>>+        tcet = spapr_tce_new_table(DEVICE(sphb),
> >>>>+                                   SPAPR_PCI_LIOBN(sphb->index, i));
> >>>>+        if (!tcet) {
> >>>>+            error_setg(errp, "Creating window#%d failed for %s",
> >>>>+                       i, sphb->dtbusname);
> >>>>+            return;
> >>>>+        }
> >>>>+        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> >>>>+                                            spapr_tce_get_iommu(tcet), 0);
> >>>>+    }
> >>>>
> >>>>      sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
> >>>>  }
> >>>>@@ -1517,11 +1527,16 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
> >>>>
> >>>>  void spapr_phb_dma_reset(sPAPRPHBState *sphb)
> >>>>  {
> >>>>-    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
> >>>>      Error *local_err = NULL;
> >>>>+    int i;
> >>>>
> >>>>-    if (tcet && tcet->enabled) {
> >>>>-        spapr_phb_dma_window_disable(sphb, sphb->dma_liobn);
> >>>>+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> >>>>+        uint32_t liobn = SPAPR_PCI_LIOBN(sphb->index, i);
> >>>>+        sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> >>>>+
> >>>>+        if (tcet && tcet->enabled) {
> >>>>+            spapr_phb_dma_window_disable(sphb, liobn);
> >>>>+        }
> >>>>      }
> >>>>
> >>>>      /* Register default 32bit DMA window */
> >>>>@@ -1562,6 +1577,13 @@ static Property spapr_phb_properties[] = {
> >>>>      /* Default DMA window is 0..1GB */
> >>>>      DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
> >>>>      DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
> >>>>+    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
> >>>>+                       0x800000000000000ULL),
> >>>>+    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> >>>>+    DEFINE_PROP_UINT32("windows", sPAPRPHBState, windows_supported,
> >>>>+                       SPAPR_PCI_DMA_MAX_WINDOWS),
> >>>
> >>>What will happen if the user tries to set 'windows' larger than
> >>>SPAPR_PCI_DMA_MAX_WINDOWS?
> >>
> >>
> >>Oh. I need to replace SPAPR_PCI_DMA_MAX_WINDOWS with windows_supported
> >>everywhere, missed that. Besides that, there will be support for more
> >>windows, that's it. The host VFIO IOMMU driver will fail creating more
> >>windows but this is expected. For emulated windows, there will be more
> >>windows with no other consequences.
> >
> >Hmm.. is there actually a reason to have the windows property?  Would
> >you be better off just using the compile time constant for now.
> 
> 
> I am afraid it is going to be 2 DMA windows forever as the other DMA tlb-ish
> facility coming does not use windows at all :)

That sounds like a reason not to have the property and leave it as a
compile time constant.

> >>>>+    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
> >>>>+                       (1ULL << 12) | (1ULL << 16) | (1ULL << 24)),
> >>>>      DEFINE_PROP_END_OF_LIST(),
> >>>>  };
> >>>>
> >>>>@@ -1815,6 +1837,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>>>      uint32_t interrupt_map_mask[] = {
> >>>>          cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
> >>>>      uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
> >>>>+    uint32_t ddw_applicable[] = {
> >>>>+        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
> >>>>+        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
> >>>>+        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
> >>>>+    };
> >>>>+    uint32_t ddw_extensions[] = {
> >>>>+        cpu_to_be32(1),
> >>>>+        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> >>>>+    };
> >>>>      sPAPRTCETable *tcet;
> >>>>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
> >>>>      sPAPRFDT s_fdt;
> >>>>@@ -1839,6 +1870,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>>>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
> >>>>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
> >>>>
> >>>>+    /* Dynamic DMA window */
> >>>>+    if (phb->ddw_enabled) {
> >>>>+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
> >>>>+                         sizeof(ddw_applicable)));
> >>>>+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
> >>>>+                         &ddw_extensions, sizeof(ddw_extensions)));
> >>>>+    }
> >>>>+
> >>>>      /* Build the interrupt-map, this must matches what is done
> >>>>       * in pci_spapr_map_irq
> >>>>       */
> >>>>diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
> >>>>new file mode 100644
> >>>>index 0000000..37f805f
> >>>>--- /dev/null
> >>>>+++ b/hw/ppc/spapr_rtas_ddw.c
> >>>>@@ -0,0 +1,300 @@
> >>>>+/*
> >>>>+ * QEMU sPAPR Dynamic DMA windows support
> >>>>+ *
> >>>>+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
> >>>>+ *
> >>>>+ *  This program is free software; you can redistribute it and/or modify
> >>>>+ *  it under the terms of the GNU General Public License as published by
> >>>>+ *  the Free Software Foundation; either version 2 of the License,
> >>>>+ *  or (at your option) any later version.
> >>>>+ *
> >>>>+ *  This program is distributed in the hope that it will be useful,
> >>>>+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>>>+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>>>+ *  GNU General Public License for more details.
> >>>>+ *
> >>>>+ *  You should have received a copy of the GNU General Public License
> >>>>+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
> >>>>+ */
> >>>>+
> >>>>+#include "qemu/osdep.h"
> >>>>+#include "qemu/error-report.h"
> >>>>+#include "hw/ppc/spapr.h"
> >>>>+#include "hw/pci-host/spapr.h"
> >>>>+#include "trace.h"
> >>>>+
> >>>>+static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
> >>>>+{
> >>>>+    sPAPRTCETable *tcet;
> >>>>+
> >>>>+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
> >>>>+    if (tcet && tcet->enabled) {
> >>>>+        ++*(unsigned *)opaque;
> >>>>+    }
> >>>>+    return 0;
> >>>>+}
> >>>>+
> >>>>+static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
> >>>>+{
> >>>>+    unsigned ret = 0;
> >>>>+
> >>>>+    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
> >>>>+
> >>>>+    return ret;
> >>>>+}
> >>>>+
> >>>>+static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
> >>>>+{
> >>>>+    sPAPRTCETable *tcet;
> >>>>+
> >>>>+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
> >>>>+    if (tcet && !tcet->enabled) {
> >>>>+        *(uint32_t *)opaque = tcet->liobn;
> >>>>+        return 1;
> >>>>+    }
> >>>>+    return 0;
> >>>>+}
> >>>>+
> >>>>+static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
> >>>>+{
> >>>>+    uint32_t liobn = 0;
> >>>>+
> >>>>+    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
> >>>>+
> >>>>+    return liobn;
> >>>>+}
> >>>>+
> >>>>+static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
> >>>>+                                 uint64_t page_mask)
> >>>>+{
> >>>>+    int i, j;
> >>>>+    uint32_t mask = 0;
> >>>>+    const struct { int shift; uint32_t mask; } masks[] = {
> >>>>+        { 12, RTAS_DDW_PGSIZE_4K },
> >>>>+        { 16, RTAS_DDW_PGSIZE_64K },
> >>>>+        { 24, RTAS_DDW_PGSIZE_16M },
> >>>>+        { 25, RTAS_DDW_PGSIZE_32M },
> >>>>+        { 26, RTAS_DDW_PGSIZE_64M },
> >>>>+        { 27, RTAS_DDW_PGSIZE_128M },
> >>>>+        { 28, RTAS_DDW_PGSIZE_256M },
> >>>>+        { 34, RTAS_DDW_PGSIZE_16G },
> >>>>+    };
> >>>>+
> >>>>+    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
> >>>>+        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
> >>>>+            if ((sps[i].page_shift == masks[j].shift) &&
> >>>>+                    (page_mask & (1ULL << masks[j].shift))) {
> >>>>+                mask |= masks[j].mask;
> >>>>+            }
> >>>>+        }
> >>>>+    }
> >>>>+
> >>>>+    return mask;
> >>>>+}
> >>>>+
> >>>>+static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
> >>>>+                                         sPAPRMachineState *spapr,
> >>>>+                                         uint32_t token, uint32_t nargs,
> >>>>+                                         target_ulong args,
> >>>>+                                         uint32_t nret, target_ulong rets)
> >>>>+{
> >>>>+    CPUPPCState *env = &cpu->env;
> >>>>+    sPAPRPHBState *sphb;
> >>>>+    uint64_t buid, max_window_size;
> >>>>+    uint32_t avail, addr, pgmask = 0;
> >>>>+
> >>>>+    if ((nargs != 3) || (nret != 5)) {
> >>>>+        goto param_error_exit;
> >>>>+    }
> >>>>+
> >>>>+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> >>>>+    addr = rtas_ld(args, 0);
> >>>>+    sphb = spapr_pci_find_phb(spapr, buid);
> >>>>+    if (!sphb || !sphb->ddw_enabled) {
> >>>>+        goto param_error_exit;
> >>>>+    }
> >>>>+
> >>>>+    /* Work out supported page masks */
> >>>>+    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);
> >>>
> >>>There are a few potential problems here.  First you're just
> >>>arbitrarily picking the first entry in the sps array to filter
> >>
> >>Why first?  spapr_query_mask() has a loop 0..PPC_PAGE_SIZES_MAX_SZ.
> >
> >env->sps is a nested array, 0..PPC_PAGE_SIZES_MAX_SZ-1 for base page
> >sizes, then again for actual page sizes.  You're only examing the
> >first "row" of that table.  It kinda works because the 4k base page
> >size is first, which lists all the actual page size options.
> 
> Ah. Right. So I need to walk through all of them, ok.

Well.. that would be closer to right, but even then, I don't think
that's really right.  What you want to advertise here is all guest
capabilities that are possible on the host.

So, the first question is what's the actual size of the chunks that
the host will need to map.  Let's call that the "effective IOMMU page
size".  That's going to be whichever is smaller of the guest IOMMU
page size, and the (host) RAM page size.

e.g. For a guest using 4k IOMMU mappings on a host with 64k page size,
the effective page size is 4kiB.

For a guest using 16MiB IOMMU mappings on a host with 64kiB page size,
the effective page size is 64kiB (because a 16MiB guest "page" could
be broken up into different real pages on the host).

The next question is what can you actually map in the host IOMMU.  It
should be able to handle any effective page size that's equal to, or
smaller than the smallest page size supported on the host.  That might
involve the host inserting several host TCEs for a single effective
page, but the VFIO DMA_MAP interface should already cover that because
it takes a length parameter.

> >>>against, which doesn't seem right (except by accident).  It's a little
> >>>bit odd filtering against guest page sizes at all, although I get what
> >>>you're really trying to do is filter against allowed host page sizes.
> >>>
> >>>The other problem is that silently filtering capabilities based on the
> >>>host can be a pain for migration - I've made the mistake and had it
> >>>bite me in the past.  I think it would be safer to just check the
> >>>pagesizes requested in the property against what's possible and
> >>>outright fail if they don't match.  For convenience you could also set
> >>>according to host capabilities if the user doesn't specify anything,
> >>>but that would require explicit handling of the "default" case.
> 
> 
> What are the host capabilities here?
> 
> There is a page mask from the host IOMMU/PE which is 4K|64K|16M and many
> other sizes, this is supported always by IODA2.
> And there is PAGE_SIZE and huge pages (but only with -mempath) - so, 64K and
> 16M (with -mempath).
> 
> And there is a "ddw-query" RTAS call which tells the guest if it can use 16M
> or not. How do you suggest I advertise 16M to the guest? If I always
> advertise 16M and there is no -mempath, the guest won't try smaller page
> size.

So, here's what I think you need to do:

  1. Start with the full list of IOMMU page sizes the virtual (guest)
     hardware supports (so, 4KiB, 64KiB & 16MiB, possibly modified by
     properties)
  2. For each entry in the list work out what the effective page size
     will be, by clamping to the RAM page size.
  3. Test if each effective page size is possible on the host (i.e. is
     <= at least one of the pagesizes advertised by the host IOMMU
     info ioctl).

For a first cut it's probably reasonable to only check for == (not <=)
the supported host pagesizes.  Ideally the host (inside the kernel)
would automatically create duplicate host TCEs if effective page size
< host IOMMU page size.  Obviously it would still need to use the
supplied guest page size for the guest view of the table.


> So - if the user wants 16M IOMMU pages, he has to use -mempath and in
> addition to that explicitely say -global spapr-pci-host-bridge.pgsz=16M, and
> by default enable only 4K and 64K (or just 4K?)? I am fine with this, it
> just means more work for libvirt folks.

I think that's the safest option to start with.  Require that the user
explicitly list what pagesizes the guest IOMMU will support, and just
fail if the host can't do that.

We could potentially auto-filter (as we already do for CPU/MMU
pagesizes) once we've thought a bit more carefully about the possible
migration implications.


> >>For the migration purposes, both guests should be started with or without
> >>hugepages enabled; this is taken into account already. Besides that, the
> >>result of "query" won't differ.
> >
> >Hmm.. if you're migrating between TCG and KVM or between PR and HV
> >these could change as well.  I'm not sure that works at the moment,
> >but I'd prefer not to introduce any more barriers to it than we have
> >to.
> >
> >>>Remember that this code will be relevant for DDW with emulated
> >>>devices, even if VFIO is not in play at all.
> >>>
> >>>All those considerations aside, it seems like it would make more sense
> >>>to do this filtering during device realize, rather than leaving it
> >>>until the guest queries.
> >>
> >>The result will be the same, it only depends on whether hugepages are
> >>enabled or not and this happens at the start time. But yes, feels more
> >>accurate to do this in PHB realize(), I'll move it.
> >>
> >>
> >>>
> >>>>+    /*
> >>>>+     * This is "Largest contiguous block of TCEs allocated specifically
> >>>>+     * for (that is, are reserved for) this PE".
> >>>>+     * Return the maximum number as maximum supported RAM size was in 4K pages.
> >>>>+     */
> >>>>+    max_window_size = MACHINE(spapr)->maxram_size >> SPAPR_TCE_PAGE_SHIFT;
> >>>
> >>>Will maxram_size always be enough?  There will sometimes be an
> >>>alignment gap between the "base" RAM and the hotpluggable RAM, meaning
> >>>that if everything is plugged the last RAM address will be beyond
> >>>maxram_size.  Will that require pushing this number up, or will the
> >>>guest "repack" the RAM layout when it maps it into the TCE tables?
> >>
> >>
> >>Hm. I do not know what the guest does to DDW on memory hotplug but this is a
> >>valid point... What QEMU helper does return the last available address in
> >>the system memory address space? Like memblock_end_of_DRAM() in the kernel,
> >>I would use that instead.
> >
> >There is a last_ram_offset() but that's in the ram_addr_t address
> 
> What do you call "ram_addr_t address space"? Non-pluggable memory and
> machine->ram_size is its size?

There's an address space in which all RAMBlocks live - for internal
qemu tracking purposes - which is *different* from the guest physical
address space (although they'll match in simpler cases).  Variables of
type ram_addr_t are in that address space, whereas variables of type
hwaddr are in the guest physical address space.  Or at least they're
supposed to be: unfortunately there are some variables with the wrong
type about :(.

I forget exactly why the ram_addr_t space exists - I think it's
usually contiguous, even if the ramblocks aren't contiguous in GPA,
and it's used for tracking qemu's internal dirty bitmaps amongst other
things.

> >space, which isn't necessarily the same as the physical address space
> >(though it's usually similar).
> 
> 
> I looked at the code and it looks like machine->ram_size is always what came
> from "-m" and it won't grow if some memory was hotplugged, is that correct?

Yes, that's right.

> And hotpluggable memory does not appear in the global ram_list as a RAMBlock?

Well.. hotplugged memory should appear there.  Hotpluggable, but not
yet plugged memory won't, if that's what you mean.

> >You can have a look at what we check
> >in (the TCG/PR version of) H_ENTER which needs to check this as well.
> 
> Ok, thanks for the pointer.
> 
>
Alexey Kardashevskiy March 29, 2016, 6:23 a.m. UTC | #6
On 03/29/2016 04:22 PM, David Gibson wrote:
> On Thu, Mar 24, 2016 at 01:32:48PM +1100, Alexey Kardashevskiy wrote:
>> On 03/23/2016 05:11 PM, David Gibson wrote:
>>> On Wed, Mar 23, 2016 at 02:28:01PM +1100, Alexey Kardashevskiy wrote:
>>>> On 03/23/2016 01:13 PM, David Gibson wrote:
>>>>> On Mon, Mar 21, 2016 at 06:47:06PM +1100, Alexey Kardashevskiy wrote:
>>>>>> This adds support for Dynamic DMA Windows (DDW) option defined by
>>>>>> the SPAPR specification which allows to have additional DMA window(s)
>>>>>>
>>>>>> This implements DDW for emulated and VFIO devices.
>>>>>> This reserves RTAS token numbers for DDW calls.
>>>>>>
>>>>>> This changes the TCE table migration descriptor to support dynamic
>>>>>> tables as from now on, PHB will create as many stub TCE table objects
>>>>>> as PHB can possibly support but not all of them might be initialized at
>>>>>> the time of migration because DDW might or might not be requested by
>>>>>> the guest.
>>>>>>
>>>>>> The "ddw" property is enabled by default on a PHB but for compatibility
>>>>>> the pseries-2.5 machine and older disable it.
>>>>>>
>>>>>> This implements DDW for VFIO. The host kernel support is required.
>>>>>> This adds a "levels" property to PHB to control the number of levels
>>>>>> in the actual TCE table allocated by the host kernel, 0 is the default
>>>>>> value to tell QEMU to calculate the correct value. Current hardware
>>>>>> supports up to 5 levels.
>>>>>>
>>>>>> The existing linux guests try creating one additional huge DMA window
>>>>>> with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
>>>>>> the guest switches to dma_direct_ops and never calls TCE hypercalls
>>>>>> (H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
>>>>>> and not waste time on map/unmap later. This adds a "dma64_win_addr"
>>>>>> property which is a bus address for the 64bit window and by default
>>>>>> set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
>>>>>> uses and this allows having emulated and VFIO devices on the same bus.
>>>>>>
>>>>>> This adds 4 RTAS handlers:
>>>>>> * ibm,query-pe-dma-window
>>>>>> * ibm,create-pe-dma-window
>>>>>> * ibm,remove-pe-dma-window
>>>>>> * ibm,reset-pe-dma-window
>>>>>> These are registered from type_init() callback.
>>>>>>
>>>>>> These RTAS handlers are implemented in a separate file to avoid polluting
>>>>>> spapr_iommu.c with PCI.
>>>>>>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> ---
>>>>>>   hw/ppc/Makefile.objs        |   1 +
>>>>>>   hw/ppc/spapr.c              |   7 +-
>>>>>>   hw/ppc/spapr_pci.c          |  73 ++++++++---
>>>>>>   hw/ppc/spapr_rtas_ddw.c     | 300 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>   hw/vfio/common.c            |   5 -
>>>>>>   include/hw/pci-host/spapr.h |  13 ++
>>>>>>   include/hw/ppc/spapr.h      |  16 ++-
>>>>>>   trace-events                |   4 +
>>>>>>   8 files changed, 395 insertions(+), 24 deletions(-)
>>>>>>   create mode 100644 hw/ppc/spapr_rtas_ddw.c
>>>>>>
>>>>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>>>>>> index c1ffc77..986b36f 100644
>>>>>> --- a/hw/ppc/Makefile.objs
>>>>>> +++ b/hw/ppc/Makefile.objs
>>>>>> @@ -7,6 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>>>>>>   ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>>>>>>   obj-y += spapr_pci_vfio.o
>>>>>>   endif
>>>>>> +obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
>>>>>>   # PowerPC 4xx boards
>>>>>>   obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
>>>>>>   obj-y += ppc4xx_pci.o
>>>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>>>> index d0bb423..ef4c637 100644
>>>>>> --- a/hw/ppc/spapr.c
>>>>>> +++ b/hw/ppc/spapr.c
>>>>>> @@ -2362,7 +2362,12 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
>>>>>>    * pseries-2.5
>>>>>>    */
>>>>>>   #define SPAPR_COMPAT_2_5 \
>>>>>> -        HW_COMPAT_2_5
>>>>>> +        HW_COMPAT_2_5 \
>>>>>> +        {\
>>>>>> +            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
>>>>>> +            .property = "ddw",\
>>>>>> +            .value    = stringify(off),\
>>>>>> +        },
>>>>>>
>>>>>>   static void spapr_machine_2_5_instance_options(MachineState *machine)
>>>>>>   {
>>>>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>>>>>> index af99a36..3bb294a 100644
>>>>>> --- a/hw/ppc/spapr_pci.c
>>>>>> +++ b/hw/ppc/spapr_pci.c
>>>>>> @@ -803,12 +803,12 @@ static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
>>>>>>       return buf;
>>>>>>   }
>>>>>>
>>>>>> -static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>>>>>> -                                       uint32_t liobn,
>>>>>> -                                       uint32_t page_shift,
>>>>>> -                                       uint64_t window_addr,
>>>>>> -                                       uint64_t window_size,
>>>>>> -                                       Error **errp)
>>>>>> +void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>>>>>> +                                 uint32_t liobn,
>>>>>> +                                 uint32_t page_shift,
>>>>>> +                                 uint64_t window_addr,
>>>>>> +                                 uint64_t window_size,
>>>>>> +                                 Error **errp)
>>>>>>   {
>>>>>>       sPAPRTCETable *tcet;
>>>>>>       uint32_t nb_table = window_size >> page_shift;
>>>>>> @@ -825,10 +825,16 @@ static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
>>>>>>           return;
>>>>>>       }
>>>>>>
>>>>>> +    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
>>>>>> +        error_setg(errp,
>>>>>> +                   "Attempt to use second window when DDW is disabled on PHB");
>>>>>> +        return;
>>>>>> +    }
>>>>>
>>>>> This should never happen unless something is wrong with the tests in
>>>>> the RTAS functions, yes?  In which case it should probably be an
>>>>> assert().
>>>>
>>>> This should not. But this is called from the RTAS caller so I'd really like
>>>> to have a message rather than assert() if that condition happens, here or in
>>>> rtas_ibm_create_pe_dma_window().
>>>
>>> It should only be called from RTAS if ddw is enabled though, yes?
>>
>>
>>  From RTAS and from the PHB reset handler. Well. I will get rid of
>> spapr_phb_dma_window_enable/spapr_phb_dma_window_disable, they are quite
>> useless when I look at them now.
>
> Ok.
>
>>>>>>       spapr_tce_table_enable(tcet, page_shift, window_addr, nb_table);
>>>>>>   }
>>>>>>
>>>>>> -static int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
>>>>>> +int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
>>>>>>   {
>>>>>>       sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>>>>>>
>>>>>> @@ -1492,14 +1498,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>>>>>>       }
>>>>>>
>>>>>>       /* DMA setup */
>>>>>> -    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
>>>>>> -    if (!tcet) {
>>>>>> -        error_report("No default TCE table for %s", sphb->dtbusname);
>>>>>> -        return;
>>>>>> -    }
>>>>>>
>>>>>> -    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
>>>>>> -                                        spapr_tce_get_iommu(tcet), 0);
>>>>>> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
>>>>>> +        tcet = spapr_tce_new_table(DEVICE(sphb),
>>>>>> +                                   SPAPR_PCI_LIOBN(sphb->index, i));
>>>>>> +        if (!tcet) {
>>>>>> +            error_setg(errp, "Creating window#%d failed for %s",
>>>>>> +                       i, sphb->dtbusname);
>>>>>> +            return;
>>>>>> +        }
>>>>>> +        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
>>>>>> +                                            spapr_tce_get_iommu(tcet), 0);
>>>>>> +    }
>>>>>>
>>>>>>       sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
>>>>>>   }
>>>>>> @@ -1517,11 +1527,16 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
>>>>>>
>>>>>>   void spapr_phb_dma_reset(sPAPRPHBState *sphb)
>>>>>>   {
>>>>>> -    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
>>>>>>       Error *local_err = NULL;
>>>>>> +    int i;
>>>>>>
>>>>>> -    if (tcet && tcet->enabled) {
>>>>>> -        spapr_phb_dma_window_disable(sphb, sphb->dma_liobn);
>>>>>> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
>>>>>> +        uint32_t liobn = SPAPR_PCI_LIOBN(sphb->index, i);
>>>>>> +        sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
>>>>>> +
>>>>>> +        if (tcet && tcet->enabled) {
>>>>>> +            spapr_phb_dma_window_disable(sphb, liobn);
>>>>>> +        }
>>>>>>       }
>>>>>>
>>>>>>       /* Register default 32bit DMA window */
>>>>>> @@ -1562,6 +1577,13 @@ static Property spapr_phb_properties[] = {
>>>>>>       /* Default DMA window is 0..1GB */
>>>>>>       DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
>>>>>>       DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
>>>>>> +    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
>>>>>> +                       0x800000000000000ULL),
>>>>>> +    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
>>>>>> +    DEFINE_PROP_UINT32("windows", sPAPRPHBState, windows_supported,
>>>>>> +                       SPAPR_PCI_DMA_MAX_WINDOWS),
>>>>>
>>>>> What will happen if the user tries to set 'windows' larger than
>>>>> SPAPR_PCI_DMA_MAX_WINDOWS?
>>>>
>>>>
>>>> Oh. I need to replace SPAPR_PCI_DMA_MAX_WINDOWS with windows_supported
>>>> everywhere, missed that. Besides that, there will be support for more
>>>> windows, that's it. The host VFIO IOMMU driver will fail creating more
>>>> windows but this is expected. For emulated windows, there will be more
>>>> windows with no other consequences.
>>>
>>> Hmm.. is there actually a reason to have the windows property?  Would
>>> you be better off just using the compile time constant for now.
>>
>>
>> I am afraid it is going to be 2 DMA windows forever as the other DMA tlb-ish
>> facility coming does not use windows at all :)
>
> That sounds like a reason not to have the property and leave it as a
> compile time constant.
>
>>>>>> +    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
>>>>>> +                       (1ULL << 12) | (1ULL << 16) | (1ULL << 24)),
>>>>>>       DEFINE_PROP_END_OF_LIST(),
>>>>>>   };
>>>>>>
>>>>>> @@ -1815,6 +1837,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>>>>>       uint32_t interrupt_map_mask[] = {
>>>>>>           cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
>>>>>>       uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
>>>>>> +    uint32_t ddw_applicable[] = {
>>>>>> +        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
>>>>>> +        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
>>>>>> +        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
>>>>>> +    };
>>>>>> +    uint32_t ddw_extensions[] = {
>>>>>> +        cpu_to_be32(1),
>>>>>> +        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
>>>>>> +    };
>>>>>>       sPAPRTCETable *tcet;
>>>>>>       PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
>>>>>>       sPAPRFDT s_fdt;
>>>>>> @@ -1839,6 +1870,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>>>>>       _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
>>>>>>       _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
>>>>>>
>>>>>> +    /* Dynamic DMA window */
>>>>>> +    if (phb->ddw_enabled) {
>>>>>> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
>>>>>> +                         sizeof(ddw_applicable)));
>>>>>> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
>>>>>> +                         &ddw_extensions, sizeof(ddw_extensions)));
>>>>>> +    }
>>>>>> +
>>>>>>       /* Build the interrupt-map, this must matches what is done
>>>>>>        * in pci_spapr_map_irq
>>>>>>        */
>>>>>> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
>>>>>> new file mode 100644
>>>>>> index 0000000..37f805f
>>>>>> --- /dev/null
>>>>>> +++ b/hw/ppc/spapr_rtas_ddw.c
>>>>>> @@ -0,0 +1,300 @@
>>>>>> +/*
>>>>>> + * QEMU sPAPR Dynamic DMA windows support
>>>>>> + *
>>>>>> + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
>>>>>> + *
>>>>>> + *  This program is free software; you can redistribute it and/or modify
>>>>>> + *  it under the terms of the GNU General Public License as published by
>>>>>> + *  the Free Software Foundation; either version 2 of the License,
>>>>>> + *  or (at your option) any later version.
>>>>>> + *
>>>>>> + *  This program is distributed in the hope that it will be useful,
>>>>>> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>>>> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>>>> + *  GNU General Public License for more details.
>>>>>> + *
>>>>>> + *  You should have received a copy of the GNU General Public License
>>>>>> + *  along with this program; if not, see <http://www.gnu.org/licenses/>.
>>>>>> + */
>>>>>> +
>>>>>> +#include "qemu/osdep.h"
>>>>>> +#include "qemu/error-report.h"
>>>>>> +#include "hw/ppc/spapr.h"
>>>>>> +#include "hw/pci-host/spapr.h"
>>>>>> +#include "trace.h"
>>>>>> +
>>>>>> +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
>>>>>> +{
>>>>>> +    sPAPRTCETable *tcet;
>>>>>> +
>>>>>> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
>>>>>> +    if (tcet && tcet->enabled) {
>>>>>> +        ++*(unsigned *)opaque;
>>>>>> +    }
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
>>>>>> +{
>>>>>> +    unsigned ret = 0;
>>>>>> +
>>>>>> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
>>>>>> +
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
>>>>>> +{
>>>>>> +    sPAPRTCETable *tcet;
>>>>>> +
>>>>>> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
>>>>>> +    if (tcet && !tcet->enabled) {
>>>>>> +        *(uint32_t *)opaque = tcet->liobn;
>>>>>> +        return 1;
>>>>>> +    }
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
>>>>>> +{
>>>>>> +    uint32_t liobn = 0;
>>>>>> +
>>>>>> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
>>>>>> +
>>>>>> +    return liobn;
>>>>>> +}
>>>>>> +
>>>>>> +static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
>>>>>> +                                 uint64_t page_mask)
>>>>>> +{
>>>>>> +    int i, j;
>>>>>> +    uint32_t mask = 0;
>>>>>> +    const struct { int shift; uint32_t mask; } masks[] = {
>>>>>> +        { 12, RTAS_DDW_PGSIZE_4K },
>>>>>> +        { 16, RTAS_DDW_PGSIZE_64K },
>>>>>> +        { 24, RTAS_DDW_PGSIZE_16M },
>>>>>> +        { 25, RTAS_DDW_PGSIZE_32M },
>>>>>> +        { 26, RTAS_DDW_PGSIZE_64M },
>>>>>> +        { 27, RTAS_DDW_PGSIZE_128M },
>>>>>> +        { 28, RTAS_DDW_PGSIZE_256M },
>>>>>> +        { 34, RTAS_DDW_PGSIZE_16G },
>>>>>> +    };
>>>>>> +
>>>>>> +    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
>>>>>> +        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
>>>>>> +            if ((sps[i].page_shift == masks[j].shift) &&
>>>>>> +                    (page_mask & (1ULL << masks[j].shift))) {
>>>>>> +                mask |= masks[j].mask;
>>>>>> +            }
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    return mask;
>>>>>> +}
>>>>>> +
>>>>>> +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
>>>>>> +                                         sPAPRMachineState *spapr,
>>>>>> +                                         uint32_t token, uint32_t nargs,
>>>>>> +                                         target_ulong args,
>>>>>> +                                         uint32_t nret, target_ulong rets)
>>>>>> +{
>>>>>> +    CPUPPCState *env = &cpu->env;
>>>>>> +    sPAPRPHBState *sphb;
>>>>>> +    uint64_t buid, max_window_size;
>>>>>> +    uint32_t avail, addr, pgmask = 0;
>>>>>> +
>>>>>> +    if ((nargs != 3) || (nret != 5)) {
>>>>>> +        goto param_error_exit;
>>>>>> +    }
>>>>>> +
>>>>>> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
>>>>>> +    addr = rtas_ld(args, 0);
>>>>>> +    sphb = spapr_pci_find_phb(spapr, buid);
>>>>>> +    if (!sphb || !sphb->ddw_enabled) {
>>>>>> +        goto param_error_exit;
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Work out supported page masks */
>>>>>> +    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);
>>>>>
>>>>> There are a few potential problems here.  First you're just
>>>>> arbitrarily picking the first entry in the sps array to filter
>>>>
>>>> Why first?  spapr_query_mask() has a loop 0..PPC_PAGE_SIZES_MAX_SZ.
>>>
>>> env->sps is a nested array, 0..PPC_PAGE_SIZES_MAX_SZ-1 for base page
>>> sizes, then again for actual page sizes.  You're only examing the
>>> first "row" of that table.  It kinda works because the 4k base page
>>> size is first, which lists all the actual page size options.
>>
>> Ah. Right. So I need to walk through all of them, ok.
>
> Well.. that would be closer to right, but even then, I don't think
> that's really right.  What you want to advertise here is all guest
> capabilities that are possible on the host.
>
> So, the first question is what's the actual size of the chunks that
> the host will need to map.  Let's call that the "effective IOMMU page
> size".  That's going to be whichever is smaller of the guest IOMMU
> page size, and the (host) RAM page size.
>
> e.g. For a guest using 4k IOMMU mappings on a host with 64k page size,
> the effective page size is 4kiB.
>
> For a guest using 16MiB IOMMU mappings on a host with 64kiB page size,
> the effective page size is 64kiB (because a 16MiB guest "page" could
> be broken up into different real pages on the host).
>
> The next question is what can you actually map in the host IOMMU.  It
> should be able to handle any effective page size that's equal to, or
> smaller than the smallest page size supported on the host.  That might
> involve the host inserting several host TCEs for a single effective
> page, but the VFIO DMA_MAP interface should already cover that because
> it takes a length parameter.
>
>>>>> against, which doesn't seem right (except by accident).  It's a little
>>>>> bit odd filtering against guest page sizes at all, although I get what
>>>>> you're really trying to do is filter against allowed host page sizes.
>>>>>
>>>>> The other problem is that silently filtering capabilities based on the
>>>>> host can be a pain for migration - I've made the mistake and had it
>>>>> bite me in the past.  I think it would be safer to just check the
>>>>> pagesizes requested in the property against what's possible and
>>>>> outright fail if they don't match.  For convenience you could also set
>>>>> according to host capabilities if the user doesn't specify anything,
>>>>> but that would require explicit handling of the "default" case.
>>
>>
>> What are the host capabilities here?
>>
>> There is a page mask from the host IOMMU/PE which is 4K|64K|16M and many
>> other sizes, this is supported always by IODA2.
>> And there is PAGE_SIZE and huge pages (but only with -mempath) - so, 64K and
>> 16M (with -mempath).
>>
>> And there is a "ddw-query" RTAS call which tells the guest if it can use 16M
>> or not. How do you suggest I advertise 16M to the guest? If I always
>> advertise 16M and there is no -mempath, the guest won't try smaller page
>> size.
>
> So, here's what I think you need to do:
>
>    1. Start with the full list of IOMMU page sizes the virtual (guest)
>       hardware supports (so, 4KiB, 64KiB & 16MiB, possibly modified by
>       properties)
>    2. For each entry in the list work out what the effective page size
>       will be, by clamping to the RAM page size.
>    3. Test if each effective page size is possible on the host (i.e. is
>       <= at least one of the pagesizes advertised by the host IOMMU
>       info ioctl).
>
> For a first cut it's probably reasonable to only check for == (not <=)
> the supported host pagesizes.  Ideally the host (inside the kernel)
> would automatically create duplicate host TCEs if effective page size
> < host IOMMU page size.  Obviously it would still need to use the
> supplied guest page size for the guest view of the table.
>
>
>> So - if the user wants 16M IOMMU pages, he has to use -mempath and in
>> addition to that explicitely say -global spapr-pci-host-bridge.pgsz=16M, and
>> by default enable only 4K and 64K (or just 4K?)? I am fine with this, it
>> just means more work for libvirt folks.
>
> I think that's the safest option to start with.  Require that the user
> explicitly list what pagesizes the guest IOMMU will support, and just
> fail if the host can't do that.
>
> We could potentially auto-filter (as we already do for CPU/MMU
> pagesizes) once we've thought a bit more carefully about the possible
> migration implications.


For now I enable 4K and 64K for a guest and then rely on the kernel's 
ability to create what the guest requested.

In v15, I'll add 3 patches to auto-add 16MB to the allowed mask list, may 
be it won't look too ugly.
David Gibson March 31, 2016, 3:19 a.m. UTC | #7
On Thu, Mar 24, 2016 at 01:32:48PM +1100, Alexey Kardashevskiy wrote:
> On 03/23/2016 05:11 PM, David Gibson wrote:
> >On Wed, Mar 23, 2016 at 02:28:01PM +1100, Alexey Kardashevskiy wrote:
> >>On 03/23/2016 01:13 PM, David Gibson wrote:
> >>>On Mon, Mar 21, 2016 at 06:47:06PM +1100, Alexey Kardashevskiy wrote:
> >>>>This adds support for Dynamic DMA Windows (DDW) option defined by
> >>>>the SPAPR specification which allows to have additional DMA window(s)
> >>>>
> >>>>This implements DDW for emulated and VFIO devices.
> >>>>This reserves RTAS token numbers for DDW calls.
> >>>>
> >>>>This changes the TCE table migration descriptor to support dynamic
> >>>>tables as from now on, PHB will create as many stub TCE table objects
> >>>>as PHB can possibly support but not all of them might be initialized at
> >>>>the time of migration because DDW might or might not be requested by
> >>>>the guest.
> >>>>
> >>>>The "ddw" property is enabled by default on a PHB but for compatibility
> >>>>the pseries-2.5 machine and older disable it.
> >>>>
> >>>>This implements DDW for VFIO. The host kernel support is required.
> >>>>This adds a "levels" property to PHB to control the number of levels
> >>>>in the actual TCE table allocated by the host kernel, 0 is the default
> >>>>value to tell QEMU to calculate the correct value. Current hardware
> >>>>supports up to 5 levels.
> >>>>
> >>>>The existing linux guests try creating one additional huge DMA window
> >>>>with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
> >>>>the guest switches to dma_direct_ops and never calls TCE hypercalls
> >>>>(H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
> >>>>and not waste time on map/unmap later. This adds a "dma64_win_addr"
> >>>>property which is a bus address for the 64bit window and by default
> >>>>set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
> >>>>uses and this allows having emulated and VFIO devices on the same bus.
> >>>>
> >>>>This adds 4 RTAS handlers:
> >>>>* ibm,query-pe-dma-window
> >>>>* ibm,create-pe-dma-window
> >>>>* ibm,remove-pe-dma-window
> >>>>* ibm,reset-pe-dma-window
> >>>>These are registered from type_init() callback.
> >>>>
> >>>>These RTAS handlers are implemented in a separate file to avoid polluting
> >>>>spapr_iommu.c with PCI.
> >>>>
> >>>>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>---
> >>>>  hw/ppc/Makefile.objs        |   1 +
> >>>>  hw/ppc/spapr.c              |   7 +-
> >>>>  hw/ppc/spapr_pci.c          |  73 ++++++++---
> >>>>  hw/ppc/spapr_rtas_ddw.c     | 300 ++++++++++++++++++++++++++++++++++++++++++++
> >>>>  hw/vfio/common.c            |   5 -
> >>>>  include/hw/pci-host/spapr.h |  13 ++
> >>>>  include/hw/ppc/spapr.h      |  16 ++-
> >>>>  trace-events                |   4 +
> >>>>  8 files changed, 395 insertions(+), 24 deletions(-)
> >>>>  create mode 100644 hw/ppc/spapr_rtas_ddw.c
> >>>>
> >>>>diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >>>>index c1ffc77..986b36f 100644
> >>>>--- a/hw/ppc/Makefile.objs
> >>>>+++ b/hw/ppc/Makefile.objs
> >>>>@@ -7,6 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> >>>>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
> >>>>  obj-y += spapr_pci_vfio.o
> >>>>  endif
> >>>>+obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
> >>>>  # PowerPC 4xx boards
> >>>>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
> >>>>  obj-y += ppc4xx_pci.o
> >>>>diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>>>index d0bb423..ef4c637 100644
> >>>>--- a/hw/ppc/spapr.c
> >>>>+++ b/hw/ppc/spapr.c
> >>>>@@ -2362,7 +2362,12 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
> >>>>   * pseries-2.5
> >>>>   */
> >>>>  #define SPAPR_COMPAT_2_5 \
> >>>>-        HW_COMPAT_2_5
> >>>>+        HW_COMPAT_2_5 \
> >>>>+        {\
> >>>>+            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
> >>>>+            .property = "ddw",\
> >>>>+            .value    = stringify(off),\
> >>>>+        },
> >>>>
> >>>>  static void spapr_machine_2_5_instance_options(MachineState *machine)
> >>>>  {
> >>>>diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> >>>>index af99a36..3bb294a 100644
> >>>>--- a/hw/ppc/spapr_pci.c
> >>>>+++ b/hw/ppc/spapr_pci.c
> >>>>@@ -803,12 +803,12 @@ static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
> >>>>      return buf;
> >>>>  }
> >>>>
> >>>>-static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>>>-                                       uint32_t liobn,
> >>>>-                                       uint32_t page_shift,
> >>>>-                                       uint64_t window_addr,
> >>>>-                                       uint64_t window_size,
> >>>>-                                       Error **errp)
> >>>>+void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>>>+                                 uint32_t liobn,
> >>>>+                                 uint32_t page_shift,
> >>>>+                                 uint64_t window_addr,
> >>>>+                                 uint64_t window_size,
> >>>>+                                 Error **errp)
> >>>>  {
> >>>>      sPAPRTCETable *tcet;
> >>>>      uint32_t nb_table = window_size >> page_shift;
> >>>>@@ -825,10 +825,16 @@ static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
> >>>>          return;
> >>>>      }
> >>>>
> >>>>+    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
> >>>>+        error_setg(errp,
> >>>>+                   "Attempt to use second window when DDW is disabled on PHB");
> >>>>+        return;
> >>>>+    }
> >>>
> >>>This should never happen unless something is wrong with the tests in
> >>>the RTAS functions, yes?  In which case it should probably be an
> >>>assert().
> >>
> >>This should not. But this is called from the RTAS caller so I'd really like
> >>to have a message rather than assert() if that condition happens, here or in
> >>rtas_ibm_create_pe_dma_window().
> >
> >It should only be called from RTAS if ddw is enabled though, yes?
> 
> 
> From RTAS and from the PHB reset handler. Well. I will get rid of
> spapr_phb_dma_window_enable/spapr_phb_dma_window_disable, they are quite
> useless when I look at them now.

Ok.

> >>>>      spapr_tce_table_enable(tcet, page_shift, window_addr, nb_table);
> >>>>  }
> >>>>
> >>>>-static int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
> >>>>+int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
> >>>>  {
> >>>>      sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> >>>>
> >>>>@@ -1492,14 +1498,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
> >>>>      }
> >>>>
> >>>>      /* DMA setup */
> >>>>-    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
> >>>>-    if (!tcet) {
> >>>>-        error_report("No default TCE table for %s", sphb->dtbusname);
> >>>>-        return;
> >>>>-    }
> >>>>
> >>>>-    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> >>>>-                                        spapr_tce_get_iommu(tcet), 0);
> >>>>+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> >>>>+        tcet = spapr_tce_new_table(DEVICE(sphb),
> >>>>+                                   SPAPR_PCI_LIOBN(sphb->index, i));
> >>>>+        if (!tcet) {
> >>>>+            error_setg(errp, "Creating window#%d failed for %s",
> >>>>+                       i, sphb->dtbusname);
> >>>>+            return;
> >>>>+        }
> >>>>+        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> >>>>+                                            spapr_tce_get_iommu(tcet), 0);
> >>>>+    }
> >>>>
> >>>>      sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
> >>>>  }
> >>>>@@ -1517,11 +1527,16 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
> >>>>
> >>>>  void spapr_phb_dma_reset(sPAPRPHBState *sphb)
> >>>>  {
> >>>>-    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
> >>>>      Error *local_err = NULL;
> >>>>+    int i;
> >>>>
> >>>>-    if (tcet && tcet->enabled) {
> >>>>-        spapr_phb_dma_window_disable(sphb, sphb->dma_liobn);
> >>>>+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> >>>>+        uint32_t liobn = SPAPR_PCI_LIOBN(sphb->index, i);
> >>>>+        sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> >>>>+
> >>>>+        if (tcet && tcet->enabled) {
> >>>>+            spapr_phb_dma_window_disable(sphb, liobn);
> >>>>+        }
> >>>>      }
> >>>>
> >>>>      /* Register default 32bit DMA window */
> >>>>@@ -1562,6 +1577,13 @@ static Property spapr_phb_properties[] = {
> >>>>      /* Default DMA window is 0..1GB */
> >>>>      DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
> >>>>      DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
> >>>>+    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
> >>>>+                       0x800000000000000ULL),
> >>>>+    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> >>>>+    DEFINE_PROP_UINT32("windows", sPAPRPHBState, windows_supported,
> >>>>+                       SPAPR_PCI_DMA_MAX_WINDOWS),
> >>>
> >>>What will happen if the user tries to set 'windows' larger than
> >>>SPAPR_PCI_DMA_MAX_WINDOWS?
> >>
> >>
> >>Oh. I need to replace SPAPR_PCI_DMA_MAX_WINDOWS with windows_supported
> >>everywhere, missed that. Besides that, there will be support for more
> >>windows, that's it. The host VFIO IOMMU driver will fail creating more
> >>windows but this is expected. For emulated windows, there will be more
> >>windows with no other consequences.
> >
> >Hmm.. is there actually a reason to have the windows property?  Would
> >you be better off just using the compile time constant for now.
> 
> 
> I am afraid it is going to be 2 DMA windows forever as the other DMA tlb-ish
> facility coming does not use windows at all :)

Ok, that sounds like a vote for removing the property.

> >>>>+    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
> >>>>+                       (1ULL << 12) | (1ULL << 16) | (1ULL << 24)),
> >>>>      DEFINE_PROP_END_OF_LIST(),
> >>>>  };
> >>>>
> >>>>@@ -1815,6 +1837,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>>>      uint32_t interrupt_map_mask[] = {
> >>>>          cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
> >>>>      uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
> >>>>+    uint32_t ddw_applicable[] = {
> >>>>+        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
> >>>>+        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
> >>>>+        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
> >>>>+    };
> >>>>+    uint32_t ddw_extensions[] = {
> >>>>+        cpu_to_be32(1),
> >>>>+        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> >>>>+    };
> >>>>      sPAPRTCETable *tcet;
> >>>>      PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
> >>>>      sPAPRFDT s_fdt;
> >>>>@@ -1839,6 +1870,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >>>>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
> >>>>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
> >>>>
> >>>>+    /* Dynamic DMA window */
> >>>>+    if (phb->ddw_enabled) {
> >>>>+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
> >>>>+                         sizeof(ddw_applicable)));
> >>>>+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
> >>>>+                         &ddw_extensions, sizeof(ddw_extensions)));
> >>>>+    }
> >>>>+
> >>>>      /* Build the interrupt-map, this must matches what is done
> >>>>       * in pci_spapr_map_irq
> >>>>       */
> >>>>diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
> >>>>new file mode 100644
> >>>>index 0000000..37f805f
> >>>>--- /dev/null
> >>>>+++ b/hw/ppc/spapr_rtas_ddw.c
> >>>>@@ -0,0 +1,300 @@
> >>>>+/*
> >>>>+ * QEMU sPAPR Dynamic DMA windows support
> >>>>+ *
> >>>>+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
> >>>>+ *
> >>>>+ *  This program is free software; you can redistribute it and/or modify
> >>>>+ *  it under the terms of the GNU General Public License as published by
> >>>>+ *  the Free Software Foundation; either version 2 of the License,
> >>>>+ *  or (at your option) any later version.
> >>>>+ *
> >>>>+ *  This program is distributed in the hope that it will be useful,
> >>>>+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>>>+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>>>+ *  GNU General Public License for more details.
> >>>>+ *
> >>>>+ *  You should have received a copy of the GNU General Public License
> >>>>+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
> >>>>+ */
> >>>>+
> >>>>+#include "qemu/osdep.h"
> >>>>+#include "qemu/error-report.h"
> >>>>+#include "hw/ppc/spapr.h"
> >>>>+#include "hw/pci-host/spapr.h"
> >>>>+#include "trace.h"
> >>>>+
> >>>>+static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
> >>>>+{
> >>>>+    sPAPRTCETable *tcet;
> >>>>+
> >>>>+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
> >>>>+    if (tcet && tcet->enabled) {
> >>>>+        ++*(unsigned *)opaque;
> >>>>+    }
> >>>>+    return 0;
> >>>>+}
> >>>>+
> >>>>+static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
> >>>>+{
> >>>>+    unsigned ret = 0;
> >>>>+
> >>>>+    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
> >>>>+
> >>>>+    return ret;
> >>>>+}
> >>>>+
> >>>>+static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
> >>>>+{
> >>>>+    sPAPRTCETable *tcet;
> >>>>+
> >>>>+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
> >>>>+    if (tcet && !tcet->enabled) {
> >>>>+        *(uint32_t *)opaque = tcet->liobn;
> >>>>+        return 1;
> >>>>+    }
> >>>>+    return 0;
> >>>>+}
> >>>>+
> >>>>+static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
> >>>>+{
> >>>>+    uint32_t liobn = 0;
> >>>>+
> >>>>+    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
> >>>>+
> >>>>+    return liobn;
> >>>>+}
> >>>>+
> >>>>+static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
> >>>>+                                 uint64_t page_mask)
> >>>>+{
> >>>>+    int i, j;
> >>>>+    uint32_t mask = 0;
> >>>>+    const struct { int shift; uint32_t mask; } masks[] = {
> >>>>+        { 12, RTAS_DDW_PGSIZE_4K },
> >>>>+        { 16, RTAS_DDW_PGSIZE_64K },
> >>>>+        { 24, RTAS_DDW_PGSIZE_16M },
> >>>>+        { 25, RTAS_DDW_PGSIZE_32M },
> >>>>+        { 26, RTAS_DDW_PGSIZE_64M },
> >>>>+        { 27, RTAS_DDW_PGSIZE_128M },
> >>>>+        { 28, RTAS_DDW_PGSIZE_256M },
> >>>>+        { 34, RTAS_DDW_PGSIZE_16G },
> >>>>+    };
> >>>>+
> >>>>+    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
> >>>>+        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
> >>>>+            if ((sps[i].page_shift == masks[j].shift) &&
> >>>>+                    (page_mask & (1ULL << masks[j].shift))) {
> >>>>+                mask |= masks[j].mask;
> >>>>+            }
> >>>>+        }
> >>>>+    }
> >>>>+
> >>>>+    return mask;
> >>>>+}
> >>>>+
> >>>>+static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
> >>>>+                                         sPAPRMachineState *spapr,
> >>>>+                                         uint32_t token, uint32_t nargs,
> >>>>+                                         target_ulong args,
> >>>>+                                         uint32_t nret, target_ulong rets)
> >>>>+{
> >>>>+    CPUPPCState *env = &cpu->env;
> >>>>+    sPAPRPHBState *sphb;
> >>>>+    uint64_t buid, max_window_size;
> >>>>+    uint32_t avail, addr, pgmask = 0;
> >>>>+
> >>>>+    if ((nargs != 3) || (nret != 5)) {
> >>>>+        goto param_error_exit;
> >>>>+    }
> >>>>+
> >>>>+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> >>>>+    addr = rtas_ld(args, 0);
> >>>>+    sphb = spapr_pci_find_phb(spapr, buid);
> >>>>+    if (!sphb || !sphb->ddw_enabled) {
> >>>>+        goto param_error_exit;
> >>>>+    }
> >>>>+
> >>>>+    /* Work out supported page masks */
> >>>>+    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);
> >>>
> >>>There are a few potential problems here.  First you're just
> >>>arbitrarily picking the first entry in the sps array to filter
> >>
> >>Why first?  spapr_query_mask() has a loop 0..PPC_PAGE_SIZES_MAX_SZ.
> >
> >env->sps is a nested array, 0..PPC_PAGE_SIZES_MAX_SZ-1 for base page
> >sizes, then again for actual page sizes.  You're only examing the
> >first "row" of that table.  It kinda works because the 4k base page
> >size is first, which lists all the actual page size options.
> 
> Ah. Right. So I need to walk through all of them, ok.

Yes.. well.. except that the more I think about it, the less looking
at the target page sizes seems relevant to what IOMMU page sizes can
be supported.

The fundamental constraints are that an allowed page size needs to be:
     1. Supported by the guest IOMMU
and  2. >= than the smallest available host IOMMU page size

(1) applies always, (2) is are only relevant when VFIO is in the
picture.

This is somewhat similar to the filtering we need to do for CPU page
sizes, but I don't think it's close enough that it really makes sense
to re-use the sps table.

> >>>against, which doesn't seem right (except by accident).  It's a little
> >>>bit odd filtering against guest page sizes at all, although I get what
> >>>you're really trying to do is filter against allowed host page sizes.
> >>>
> >>>The other problem is that silently filtering capabilities based on the
> >>>host can be a pain for migration - I've made the mistake and had it
> >>>bite me in the past.  I think it would be safer to just check the
> >>>pagesizes requested in the property against what's possible and
> >>>outright fail if they don't match.  For convenience you could also set
> >>>according to host capabilities if the user doesn't specify anything,
> >>>but that would require explicit handling of the "default" case.
> 
> 
> What are the host capabilities here?
> 
> There is a page mask from the host IOMMU/PE which is 4K|64K|16M and many
> other sizes, this is supported always by IODA2.
> And there is PAGE_SIZE and huge pages (but only with -mempath) - so, 64K and
> 16M (with -mempath).
> 
> And there is a "ddw-query" RTAS call which tells the guest if it can use 16M
> or not. How do you suggest I advertise 16M to the guest? If I always
> advertise 16M and there is no -mempath, the guest won't try smaller page
> size.
> 
> So - if the user wants 16M IOMMU pages, he has to use -mempath and in
> addition to that explicitely say -global spapr-pci-host-bridge.pgsz=16M, and
> by default enable only 4K and 64K (or just 4K?)? I am fine with this, it
> just means more work for libvirt folks.

So, what complicates this is the additional constraint that 1 guest
TCE map to 1 host TCE.  That constraint isn't necessary for basic,
slow, VFIO operation: VFIO_DMA_MAP on a 16M guest page will map in
multiple TCEs on a 64k pagesized host TCE table.

The kernel acceleration does rely on the host IOMMU page size matching
the guest IOMMU page size, and the kernel DDW interface sort of does,
since we directly specify the page size (qemu could request a DDW
window with a smaller pagesize than the guest pagesize, and things
should still work ok - but the guest TCE table representation would
now have multiple host TCEs backing each entry).

Allowing those more complex cases is probably more trouble than its
worth for the time being, so I think that replaces constraint (2) with
the stronger constraint:
    2a. == to one of the host IOMMU page sizes

> >>For the migration purposes, both guests should be started with or without
> >>hugepages enabled; this is taken into account already. Besides that, the
> >>result of "query" won't differ.
> >
> >Hmm.. if you're migrating between TCG and KVM or between PR and HV
> >these could change as well.  I'm not sure that works at the moment,
> >but I'd prefer not to introduce any more barriers to it than we have
> >to.
> >
> >>>Remember that this code will be relevant for DDW with emulated
> >>>devices, even if VFIO is not in play at all.
> >>>
> >>>All those considerations aside, it seems like it would make more sense
> >>>to do this filtering during device realize, rather than leaving it
> >>>until the guest queries.
> >>
> >>The result will be the same, it only depends on whether hugepages are
> >>enabled or not and this happens at the start time. But yes, feels more
> >>accurate to do this in PHB realize(), I'll move it.
> >>
> >>
> >>>
> >>>>+    /*
> >>>>+     * This is "Largest contiguous block of TCEs allocated specifically
> >>>>+     * for (that is, are reserved for) this PE".
> >>>>+     * Return the maximum number as maximum supported RAM size was in 4K pages.
> >>>>+     */
> >>>>+    max_window_size = MACHINE(spapr)->maxram_size >> SPAPR_TCE_PAGE_SHIFT;
> >>>
> >>>Will maxram_size always be enough?  There will sometimes be an
> >>>alignment gap between the "base" RAM and the hotpluggable RAM, meaning
> >>>that if everything is plugged the last RAM address will be beyond
> >>>maxram_size.  Will that require pushing this number up, or will the
> >>>guest "repack" the RAM layout when it maps it into the TCE tables?
> >>
> >>
> >>Hm. I do not know what the guest does to DDW on memory hotplug but this is a
> >>valid point... What QEMU helper does return the last available address in
> >>the system memory address space? Like memblock_end_of_DRAM() in the kernel,
> >>I would use that instead.
> >
> >There is a last_ram_offset() but that's in the ram_addr_t address
> 
> What do you call "ram_addr_t address space"? Non-pluggable memory and
> machine->ram_size is its size?
> 
> 
> >space, which isn't necessarily the same as the physical address space
> >(though it's usually similar).
> 
> 
> I looked at the code and it looks like machine->ram_size is always what came
> from "-m" and it won't grow if some memory was hotplugged, is that correct?
> 
> And hotpluggable memory does not appear in the global ram_list as a RAMBlock?
> 
> 
> >You can have a look at what we check
> >in (the TCG/PR version of) H_ENTER which needs to check this as well.
> 
> Ok, thanks for the pointer.
> 
>
diff mbox

Patch

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index c1ffc77..986b36f 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -7,6 +7,7 @@  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
 ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
 obj-y += spapr_pci_vfio.o
 endif
+obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
 # PowerPC 4xx boards
 obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
 obj-y += ppc4xx_pci.o
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d0bb423..ef4c637 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2362,7 +2362,12 @@  DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
  * pseries-2.5
  */
 #define SPAPR_COMPAT_2_5 \
-        HW_COMPAT_2_5
+        HW_COMPAT_2_5 \
+        {\
+            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
+            .property = "ddw",\
+            .value    = stringify(off),\
+        },
 
 static void spapr_machine_2_5_instance_options(MachineState *machine)
 {
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index af99a36..3bb294a 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -803,12 +803,12 @@  static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
     return buf;
 }
 
-static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
-                                       uint32_t liobn,
-                                       uint32_t page_shift,
-                                       uint64_t window_addr,
-                                       uint64_t window_size,
-                                       Error **errp)
+void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
+                                 uint32_t liobn,
+                                 uint32_t page_shift,
+                                 uint64_t window_addr,
+                                 uint64_t window_size,
+                                 Error **errp)
 {
     sPAPRTCETable *tcet;
     uint32_t nb_table = window_size >> page_shift;
@@ -825,10 +825,16 @@  static void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
         return;
     }
 
+    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
+        error_setg(errp,
+                   "Attempt to use second window when DDW is disabled on PHB");
+        return;
+    }
+
     spapr_tce_table_enable(tcet, page_shift, window_addr, nb_table);
 }
 
-static int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
+int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn)
 {
     sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
 
@@ -1492,14 +1498,18 @@  static void spapr_phb_realize(DeviceState *dev, Error **errp)
     }
 
     /* DMA setup */
-    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
-    if (!tcet) {
-        error_report("No default TCE table for %s", sphb->dtbusname);
-        return;
-    }
 
-    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
-                                        spapr_tce_get_iommu(tcet), 0);
+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+        tcet = spapr_tce_new_table(DEVICE(sphb),
+                                   SPAPR_PCI_LIOBN(sphb->index, i));
+        if (!tcet) {
+            error_setg(errp, "Creating window#%d failed for %s",
+                       i, sphb->dtbusname);
+            return;
+        }
+        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
+                                            spapr_tce_get_iommu(tcet), 0);
+    }
 
     sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
 }
@@ -1517,11 +1527,16 @@  static int spapr_phb_children_reset(Object *child, void *opaque)
 
 void spapr_phb_dma_reset(sPAPRPHBState *sphb)
 {
-    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
     Error *local_err = NULL;
+    int i;
 
-    if (tcet && tcet->enabled) {
-        spapr_phb_dma_window_disable(sphb, sphb->dma_liobn);
+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+        uint32_t liobn = SPAPR_PCI_LIOBN(sphb->index, i);
+        sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
+
+        if (tcet && tcet->enabled) {
+            spapr_phb_dma_window_disable(sphb, liobn);
+        }
     }
 
     /* Register default 32bit DMA window */
@@ -1562,6 +1577,13 @@  static Property spapr_phb_properties[] = {
     /* Default DMA window is 0..1GB */
     DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
     DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
+    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
+                       0x800000000000000ULL),
+    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
+    DEFINE_PROP_UINT32("windows", sPAPRPHBState, windows_supported,
+                       SPAPR_PCI_DMA_MAX_WINDOWS),
+    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
+                       (1ULL << 12) | (1ULL << 16) | (1ULL << 24)),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1815,6 +1837,15 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
     uint32_t interrupt_map_mask[] = {
         cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
     uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
+    uint32_t ddw_applicable[] = {
+        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
+        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
+        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
+    };
+    uint32_t ddw_extensions[] = {
+        cpu_to_be32(1),
+        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
+    };
     sPAPRTCETable *tcet;
     PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
     sPAPRFDT s_fdt;
@@ -1839,6 +1870,14 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
     _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
     _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
 
+    /* Dynamic DMA window */
+    if (phb->ddw_enabled) {
+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
+                         sizeof(ddw_applicable)));
+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
+                         &ddw_extensions, sizeof(ddw_extensions)));
+    }
+
     /* Build the interrupt-map, this must matches what is done
      * in pci_spapr_map_irq
      */
diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
new file mode 100644
index 0000000..37f805f
--- /dev/null
+++ b/hw/ppc/spapr_rtas_ddw.c
@@ -0,0 +1,300 @@ 
+/*
+ * QEMU sPAPR Dynamic DMA windows support
+ *
+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License,
+ *  or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "hw/ppc/spapr.h"
+#include "hw/pci-host/spapr.h"
+#include "trace.h"
+
+static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
+{
+    sPAPRTCETable *tcet;
+
+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+    if (tcet && tcet->enabled) {
+        ++*(unsigned *)opaque;
+    }
+    return 0;
+}
+
+static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
+{
+    unsigned ret = 0;
+
+    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
+
+    return ret;
+}
+
+static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
+{
+    sPAPRTCETable *tcet;
+
+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+    if (tcet && !tcet->enabled) {
+        *(uint32_t *)opaque = tcet->liobn;
+        return 1;
+    }
+    return 0;
+}
+
+static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
+{
+    uint32_t liobn = 0;
+
+    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
+
+    return liobn;
+}
+
+static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
+                                 uint64_t page_mask)
+{
+    int i, j;
+    uint32_t mask = 0;
+    const struct { int shift; uint32_t mask; } masks[] = {
+        { 12, RTAS_DDW_PGSIZE_4K },
+        { 16, RTAS_DDW_PGSIZE_64K },
+        { 24, RTAS_DDW_PGSIZE_16M },
+        { 25, RTAS_DDW_PGSIZE_32M },
+        { 26, RTAS_DDW_PGSIZE_64M },
+        { 27, RTAS_DDW_PGSIZE_128M },
+        { 28, RTAS_DDW_PGSIZE_256M },
+        { 34, RTAS_DDW_PGSIZE_16G },
+    };
+
+    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
+        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
+            if ((sps[i].page_shift == masks[j].shift) &&
+                    (page_mask & (1ULL << masks[j].shift))) {
+                mask |= masks[j].mask;
+            }
+        }
+    }
+
+    return mask;
+}
+
+static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
+                                         sPAPRMachineState *spapr,
+                                         uint32_t token, uint32_t nargs,
+                                         target_ulong args,
+                                         uint32_t nret, target_ulong rets)
+{
+    CPUPPCState *env = &cpu->env;
+    sPAPRPHBState *sphb;
+    uint64_t buid, max_window_size;
+    uint32_t avail, addr, pgmask = 0;
+
+    if ((nargs != 3) || (nret != 5)) {
+        goto param_error_exit;
+    }
+
+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    addr = rtas_ld(args, 0);
+    sphb = spapr_pci_find_phb(spapr, buid);
+    if (!sphb || !sphb->ddw_enabled) {
+        goto param_error_exit;
+    }
+
+    /* Work out supported page masks */
+    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);
+
+    /*
+     * This is "Largest contiguous block of TCEs allocated specifically
+     * for (that is, are reserved for) this PE".
+     * Return the maximum number as maximum supported RAM size was in 4K pages.
+     */
+    max_window_size = MACHINE(spapr)->maxram_size >> SPAPR_TCE_PAGE_SHIFT;
+
+    avail = sphb->windows_supported - spapr_phb_get_active_win_num(sphb);
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    rtas_st(rets, 1, avail);
+    rtas_st(rets, 2, max_window_size);
+    rtas_st(rets, 3, pgmask);
+    rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
+
+    trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
+    return;
+
+param_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
+                                          sPAPRMachineState *spapr,
+                                          uint32_t token, uint32_t nargs,
+                                          target_ulong args,
+                                          uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    sPAPRTCETable *tcet = NULL;
+    uint32_t addr, page_shift, window_shift, liobn;
+    uint64_t buid;
+    Error *local_err = NULL;
+
+    if ((nargs != 5) || (nret != 4)) {
+        goto param_error_exit;
+    }
+
+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    addr = rtas_ld(args, 0);
+    sphb = spapr_pci_find_phb(spapr, buid);
+    if (!sphb || !sphb->ddw_enabled) {
+        goto param_error_exit;
+    }
+
+    page_shift = rtas_ld(args, 3);
+    window_shift = rtas_ld(args, 4);
+    liobn = spapr_phb_get_free_liobn(sphb);
+
+    if (!liobn || !(sphb->page_size_mask & (1ULL << page_shift)) ||
+        spapr_phb_get_active_win_num(sphb) == sphb->windows_supported) {
+        goto hw_error_exit;
+    }
+
+    if (window_shift < page_shift) {
+        goto param_error_exit;
+    }
+
+    spapr_phb_dma_window_enable(sphb, liobn, page_shift,
+                                sphb->dma64_window_addr,
+                                1ULL << window_shift, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        goto hw_error_exit;
+    }
+
+    tcet = spapr_tce_find_by_liobn(liobn);
+    trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift,
+                                 1ULL << window_shift,
+                                 tcet ? tcet->bus_offset : 0xbaadf00d, liobn);
+    if (local_err || !tcet) {
+        goto hw_error_exit;
+    }
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    rtas_st(rets, 1, liobn);
+    rtas_st(rets, 2, tcet->bus_offset >> 32);
+    rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1));
+
+    return;
+
+hw_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+    return;
+
+param_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu,
+                                          sPAPRMachineState *spapr,
+                                          uint32_t token, uint32_t nargs,
+                                          target_ulong args,
+                                          uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    sPAPRTCETable *tcet;
+    uint32_t liobn;
+    long ret;
+
+    if ((nargs != 1) || (nret != 1)) {
+        goto param_error_exit;
+    }
+
+    liobn = rtas_ld(args, 0);
+    tcet = spapr_tce_find_by_liobn(liobn);
+    if (!tcet) {
+        goto param_error_exit;
+    }
+
+    sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent);
+    if (!sphb || !sphb->ddw_enabled || !spapr_phb_get_active_win_num(sphb)) {
+        goto param_error_exit;
+    }
+
+    ret = spapr_phb_dma_window_disable(sphb, liobn);
+    trace_spapr_iommu_ddw_remove(liobn, ret);
+    if (ret) {
+        goto hw_error_exit;
+    }
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    return;
+
+hw_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+    return;
+
+param_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu,
+                                         sPAPRMachineState *spapr,
+                                         uint32_t token, uint32_t nargs,
+                                         target_ulong args,
+                                         uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    uint64_t buid;
+    uint32_t addr;
+
+    if ((nargs != 3) || (nret != 1)) {
+        goto param_error_exit;
+    }
+
+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    addr = rtas_ld(args, 0);
+    sphb = spapr_pci_find_phb(spapr, buid);
+    if (!sphb || !sphb->ddw_enabled) {
+        goto param_error_exit;
+    }
+
+    spapr_phb_dma_reset(sphb);
+    trace_spapr_iommu_ddw_reset(buid, addr);
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+
+    return;
+
+param_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void spapr_rtas_ddw_init(void)
+{
+    spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
+                        "ibm,query-pe-dma-window",
+                        rtas_ibm_query_pe_dma_window);
+    spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
+                        "ibm,create-pe-dma-window",
+                        rtas_ibm_create_pe_dma_window);
+    spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
+                        "ibm,remove-pe-dma-window",
+                        rtas_ibm_remove_pe_dma_window);
+    spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
+                        "ibm,reset-pe-dma-window",
+                        rtas_ibm_reset_pe_dma_window);
+}
+
+type_init(spapr_rtas_ddw_init)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 421d6eb..b0ea146 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -994,11 +994,6 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
             }
         }
 
-        /*
-         * This only considers the host IOMMU's 32-bit window.  At
-         * some point we need to add support for the optional 64-bit
-         * window and dynamic windows
-         */
         info.argsz = sizeof(info);
         ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
         if (ret) {
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 7848366..e81b751 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -71,6 +71,11 @@  struct sPAPRPHBState {
     spapr_pci_msi_mig *msi_devs;
 
     QLIST_ENTRY(sPAPRPHBState) list;
+
+    bool ddw_enabled;
+    uint32_t windows_supported;
+    uint64_t page_size_mask;
+    uint64_t dma64_window_addr;
 };
 
 #define SPAPR_PCI_MAX_INDEX          255
@@ -89,6 +94,8 @@  struct sPAPRPHBState {
 
 #define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL
 
+#define SPAPR_PCI_DMA_MAX_WINDOWS    2
+
 static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
@@ -148,5 +155,11 @@  static inline void spapr_phb_vfio_reset(DeviceState *qdev)
 #endif
 
 void spapr_phb_dma_reset(sPAPRPHBState *sphb);
+void spapr_phb_dma_window_enable(sPAPRPHBState *sphb,
+                                uint32_t liobn, uint32_t page_shift,
+                                uint64_t window_addr,
+                                uint64_t window_size,
+                                 Error **errp);
+int spapr_phb_dma_window_disable(sPAPRPHBState *sphb, uint32_t liobn);
 
 #endif /* __HW_SPAPR_PCI_H__ */
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 471eb4a..41b32c6 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -417,6 +417,16 @@  int spapr_allocate_irq_block(int num, bool lsi, bool msi);
 #define RTAS_OUT_NOT_AUTHORIZED                 -9002
 #define RTAS_OUT_SYSPARM_PARAM_ERROR            -9999
 
+/* DDW pagesize mask values from ibm,query-pe-dma-window */
+#define RTAS_DDW_PGSIZE_4K       0x01
+#define RTAS_DDW_PGSIZE_64K      0x02
+#define RTAS_DDW_PGSIZE_16M      0x04
+#define RTAS_DDW_PGSIZE_32M      0x08
+#define RTAS_DDW_PGSIZE_64M      0x10
+#define RTAS_DDW_PGSIZE_128M     0x20
+#define RTAS_DDW_PGSIZE_256M     0x40
+#define RTAS_DDW_PGSIZE_16G      0x80
+
 /* RTAS tokens */
 #define RTAS_TOKEN_BASE      0x2000
 
@@ -458,8 +468,12 @@  int spapr_allocate_irq_block(int num, bool lsi, bool msi);
 #define RTAS_IBM_SET_SLOT_RESET                 (RTAS_TOKEN_BASE + 0x23)
 #define RTAS_IBM_CONFIGURE_PE                   (RTAS_TOKEN_BASE + 0x24)
 #define RTAS_IBM_SLOT_ERROR_DETAIL              (RTAS_TOKEN_BASE + 0x25)
+#define RTAS_IBM_QUERY_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x26)
+#define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
+#define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
+#define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
 
-#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x26)
+#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
 
 /* RTAS ibm,get-system-parameter token values */
 #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
diff --git a/trace-events b/trace-events
index f2b75a3..e68d0e4 100644
--- a/trace-events
+++ b/trace-events
@@ -1431,6 +1431,10 @@  spapr_iommu_pci_indirect(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t i
 spapr_iommu_pci_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce_value, uint64_t npages, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcevalue=0x%"PRIx64" npages=%"PRId64" ret=%"PRId64
 spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
 spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d"
+spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows available, max window size=%"PRIx64", mask=%"PRIx32
+spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, uint64_t pg_size, uint64_t req_size, uint64_t start, uint32_t liobn) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%"PRIx64", requested=0x%"PRIx64", start addr=%"PRIx64", liobn=%"PRIx32
+spapr_iommu_ddw_remove(uint32_t liobn, long ret) "liobn=%"PRIx32", ret = %ld"
+spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr) "buid=%"PRIx64" addr=%"PRIx32
 
 # hw/ppc/ppc.c
 ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"