diff mbox

[v5,15/16] spapr_pci: enable basic hotplug operations

Message ID 1424096872-29868-16-git-send-email-mdroth@linux.vnet.ibm.com
State New
Headers show

Commit Message

Michael Roth Feb. 16, 2015, 2:27 p.m. UTC
This enables hotplug for PHB bridges. Upon hotplug we generate the
OF-nodes required by PAPR specification and IEEE 1275-1994
"PCI Bus Binding to Open Firmware" for the device.

We associate the corresponding FDT for these nodes with the DrcEntry
corresponding to the slot, which will be fetched via
ibm,configure-connector RTAS calls by the guest as described by PAPR
specification. The FDT is cleaned up in the case of unplug.

Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 hw/ppc/spapr_pci.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 371 insertions(+), 20 deletions(-)

Comments

David Gibson Feb. 25, 2015, 3:11 a.m. UTC | #1
On Mon, Feb 16, 2015 at 08:27:51AM -0600, Michael Roth wrote:
> This enables hotplug for PHB bridges. Upon hotplug we generate the

"PCI Host Bridge bridges" :-p

> OF-nodes required by PAPR specification and IEEE 1275-1994
> "PCI Bus Binding to Open Firmware" for the device.
> 
> We associate the corresponding FDT for these nodes with the DrcEntry
> corresponding to the slot, which will be fetched via
> ibm,configure-connector RTAS calls by the guest as described by PAPR
> specification. The FDT is cleaned up in the case of unplug.
> 
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr_pci.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 371 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 6a3d917..b9af1cd 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -33,8 +33,11 @@
>  #include <libfdt.h>
>  #include "trace.h"
>  #include "qemu/error-report.h"
> +#include "qapi/qmp/qerror.h"
>  
>  #include "hw/pci/pci_bus.h"
> +#include "hw/ppc/spapr_drc.h"
> +#include "sysemu/device_tree.h"
>  
>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
>  #define RTAS_QUERY_FN           0
> @@ -47,7 +50,13 @@
>  #define RTAS_TYPE_MSI           1
>  #define RTAS_TYPE_MSIX          2
>  
> -#include "hw/ppc/spapr_drc.h"
> +#define _FDT(exp) \
> +    do { \
> +        int ret = (exp);                                           \
> +        if (ret < 0) {                                             \
> +            return ret;                                            \
> +        }                                                          \
> +    } while (0)
>  
>  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
>  {
> @@ -481,6 +490,359 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
>      return &phb->iommu_as;
>  }
>  
> +/* Macros to operate with address in OF binding to PCI */
> +#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> +#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> +#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
> +#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
> +#define b_ss(x)         b_x((x), 24, 2) /* the space code */
> +#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
> +#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
> +#define b_fff(x)        b_x((x), 8, 3)  /* function number */
> +#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
> +/* for 'reg'/'assigned-addresses' OF properties */
> +#define RESOURCE_CELLS_SIZE 2
> +#define RESOURCE_CELLS_ADDRESS 3
> +
> +typedef struct ResourceFields {
> +    uint32_t phys_hi;
> +    uint32_t phys_mid;
> +    uint32_t phys_lo;
> +    uint32_t size_hi;
> +    uint32_t size_lo;
> +} ResourceFields;

Hrm, 5*32-bit ints, that probably needs a ((packed)) on at least some
platforms to be safe.

> +typedef struct ResourceProps {
> +    union {
> +        ResourceFields fields[8];
> +        uint8_t data[sizeof(ResourceFields) * 8];
> +    } reg;
> +    union {
> +        ResourceFields fields[7];
> +        uint8_t data[sizeof(ResourceFields) * 7];
> +    } assigned;
> +    uint32_t reg_len;
> +    uint32_t assigned_len;
> +} ResourceProps;

I'm a bit dubious about this union, rather than just casting when you
need the bytestring version.

> +
> +/* fill in the 'reg'/'assigned-resources' OF properties for
> + * a PCI device. 'reg' describes resource requirements for a
> + * device's IO/MEM regions, 'assigned-addresses' describes the
> + * actual resource assignments.
> + *
> + * the properties are arrays of ('phys-addr', 'size') pairs describing
> + * the addressable regions of the PCI device, where 'phys-addr' is a
> + * RESOURCE_CELLS_ADDRESS-tuple of 32-bit integers corresponding to
> + * (phys.hi, phys.mid, phys.lo), and 'size' is a
> + * RESOURCE_CELLS_SIZE-tuple corresponding to (size.hi, size.lo).
> + * phys.hi = 0xYYXXXXZZ, where:
> + *   0xYY = npt000ss
> + *          |||   |
> + *          |||   +-- space code: 1 if IO region, 2 if MEM region
> + *          ||+------ for non-relocatable IO: 1 if aliased
> + *          ||        for relocatable IO: 1 if below 64KB
> + *          ||        for MEM: 1 if below 1MB
> + *          |+------- 1 if region is prefetchable
> + *          +-------- 1 if region is non-relocatable
> + *   0xXXXX = bbbbbbbb dddddfff, encoding bus, slot, and function
> + *            bits respectively
> + *   0xZZ = rrrrrrrr, the register number of the BAR corresponding
> + *          to the region
> + *
> + * phys.mid and phys.lo correspond respectively to the hi/lo portions
> + * of the actual address of the region.
> + *
> + * how the phys-addr/size values are used differ slightly between
> + * 'reg' and 'assigned-addresses' properties. namely, 'reg' has
> + * an additional description for the config space region of the
> + * device, and in the case of QEMU has n=0 and phys.mid=phys.lo=0
> + * to describe the region as relocatable, with an address-mapping
> + * that corresponds directly to the PHB's address space for the
> + * resource. 'assigned-addresses' always has n=1 set with an absolute
> + * address assigned for the resource. in general, 'assigned-addresses'
> + * won't be populated, since addresses for PCI devices are generally
> + * unmapped initially and left to the guest to assign.
> + *
> + * in accordance with PCI Bus Binding to Open Firmware,
> + * IEEE Std 1275-1994, section 4.1.1, as implemented by PAPR+ v2.7,
> + * Appendix C.
> + */
> +static void populate_resource_props(PCIDevice *d, ResourceProps *rp)
> +{
> +    int bus_num = pci_bus_num(PCI_BUS(qdev_get_parent_bus(DEVICE(d))));
> +    uint32_t dev_id = (b_bbbbbbbb(bus_num) |
> +                       b_ddddd(PCI_SLOT(d->devfn)) |
> +                       b_fff(PCI_FUNC(d->devfn)));
> +    ResourceFields *reg, *assigned;
> +    int i, reg_idx = 0, assigned_idx = 0;
> +
> +    /* config space region */
> +    reg = &rp->reg.fields[reg_idx++];
> +    reg->phys_hi = cpu_to_be32(dev_id);
> +    reg->phys_mid = 0;
> +    reg->phys_lo = 0;
> +    reg->size_hi = 0;
> +    reg->size_lo = 0;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        if (!d->io_regions[i].size) {
> +            continue;
> +        }
> +
> +        reg = &rp->reg.fields[reg_idx++];
> +
> +        reg->phys_hi = cpu_to_be32(dev_id | b_rrrrrrrr(pci_bar(d, i)));
> +        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
> +            reg->phys_hi |= cpu_to_be32(b_ss(1));
> +        } else {
> +            reg->phys_hi |= cpu_to_be32(b_ss(2));
> +        }
> +        reg->phys_mid = 0;
> +        reg->phys_lo = 0;
> +        reg->size_hi = cpu_to_be32(d->io_regions[i].size >> 32);
> +        reg->size_lo = cpu_to_be32(d->io_regions[i].size);
> +
> +        if (d->io_regions[i].addr != PCI_BAR_UNMAPPED) {
> +            continue;

This has inverted sense, doesn't it?  If the BAR *is* unmapped you
want to continue, skipping the assigned-addresses stuff.

> +        }
> +
> +        assigned = &rp->assigned.fields[assigned_idx++];
> +        assigned->phys_hi = cpu_to_be32(reg->phys_hi | b_n(1));
> +        assigned->phys_mid = cpu_to_be32(d->io_regions[i].addr >> 32);
> +        assigned->phys_lo = cpu_to_be32(d->io_regions[i].addr);

Not sure if I understood the comment above properly; should these
addresses actually be translated through an mappings above the PHB?
Not that there are any such translations on sPAPR, but...

> +        assigned->size_hi = reg->size_hi;
> +        assigned->size_lo = reg->size_lo;
> +    }
> +
> +    rp->reg_len = reg_idx * sizeof(ResourceFields);
> +    rp->assigned_len = assigned_idx * sizeof(ResourceFields);
> +}
> +
> +static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
> +                                       int phb_index, int drc_index,
> +                                       const char *drc_name)
> +{
> +    ResourceProps rp;
> +    bool is_bridge = false;
> +    int pci_status;
> +
> +    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
> +        PCI_HEADER_TYPE_BRIDGE) {
> +        is_bridge = true;
> +    }
> +
> +    /* in accordance with PAPR+ v2.7 13.6.3, Table 181 */
> +    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
> +                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
> +                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
> +                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
> +                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2)
> +                            << 8));
> +    if (pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
> +                 pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
> +    }
> +
> +    if (!is_bridge) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
> +            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
> +            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
> +    }
> +
> +    if (pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
> +                 pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
> +    }
> +
> +    if (pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
> +                 pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
> +        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
> +
> +    /* the following fdt cells are masked off the pci status register */
> +    pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
> +    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
> +                          PCI_STATUS_DEVSEL_MASK & pci_status));
> +
> +    if (pci_status & PCI_STATUS_FAST_BACK) {
> +        _FDT(fdt_setprop(fdt, offset, "fast-back-to-back", NULL, 0));
> +    }
> +    if (pci_status & PCI_STATUS_66MHZ) {
> +        _FDT(fdt_setprop(fdt, offset, "66mhz-capable", NULL, 0));
> +    }
> +    if (pci_status & PCI_STATUS_UDF) {
> +        _FDT(fdt_setprop(fdt, offset, "udf-supported", NULL, 0));
> +    }
> +
> +    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));

Ah.. this is a bit weird.  Explicit "name" properties are usually
omitted from fdts.  But I guess you need it, because this will get
streamed into the guest via the RTAS interfaces, rather than into the
guest's fdt parsing code (which knowws to derive name properties from
the unit name).

That probably deserves a comment.

> +    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", drc_name, strlen(drc_name)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
> +                          RESOURCE_CELLS_ADDRESS));
> +    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
> +                          RESOURCE_CELLS_SIZE));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
> +                          RESOURCE_CELLS_SIZE));
> +
> +    populate_resource_props(dev, &rp);
> +    _FDT(fdt_setprop(fdt, offset, "reg", rp.reg.data, rp.reg_len));
> +    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
> +                     rp.assigned.data, rp.assigned_len));
> +
> +    return 0;
> +}
> +
> +/* create OF node for pci device and required OF DT properties */
> +static void *spapr_create_pci_child_dt(sPAPRPHBState *phb, PCIDevice *dev,
> +                                       int drc_index, const char *drc_name,
> +                                       int *dt_offset)
> +{
> +    void *fdt;
> +    int offset, ret, fdt_size;
> +    int slot = PCI_SLOT(dev->devfn);
> +    char nodename[512];
> +
> +    fdt = create_device_tree(&fdt_size);
> +    sprintf(nodename, "pci@%d", slot);
> +    offset = fdt_add_subnode(fdt, 0, nodename);
> +    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index, drc_index,
> +                                      drc_name);
> +    g_assert(!ret);
> +
> +    *dt_offset = offset;
> +    return fdt;
> +}
> +
> +static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
> +                                     sPAPRPHBState *phb,
> +                                     PCIDevice *pdev,
> +                                     Error **errp)
> +{
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    DeviceState *dev = DEVICE(pdev);
> +    int drc_index = drck->get_index(drc);
> +    const char *drc_name = drck->get_name(drc);
> +    void *fdt = NULL;
> +    int fdt_start_offset = 0;
> +
> +    /* boot-time devices get their device tree node created by SLOF, but for
> +     * hotplugged devices we need QEMU to generate it so the guest can fetch
> +     * it via RTAS
> +     */
> +    if (dev->hotplugged) {
> +        fdt = spapr_create_pci_child_dt(phb, pdev, drc_index, drc_name,
> +                                        &fdt_start_offset);
> +    }
> +
> +    drck->attach(drc, DEVICE(pdev),
> +                 fdt, fdt_start_offset, !dev->hotplugged, errp);
> +    if (errp) {
> +        g_free(fdt);
> +    }
> +}
> +
> +static void spapr_phb_remove_pci_device_cb(DeviceState *dev, void *opaque)
> +{
> +    /* some version guests do not wait for completion of a device
> +     * cleanup (generally done asynchronously by the kernel) before
> +     * signaling to QEMU that the device is safe, but instead sleep
> +     * for some 'safe' period of time. unfortunately on a busy host
> +     * this sleep isn't guaranteed to be long enough, resulting in
> +     * bad things like IRQ lines being left asserted during final
> +     * device removal. to deal with this we call reset just prior
> +     * to finalizing the device, which will put the device back into
> +     * an 'idle' state, as the device cleanup code expects.
> +     */
> +    pci_device_reset(PCI_DEVICE(dev));
> +    object_unparent(OBJECT(dev));
> +}
> +
> +static void spapr_phb_remove_pci_device(sPAPRDRConnector *drc,
> +                                        sPAPRPHBState *phb,
> +                                        PCIDevice *pdev,
> +                                        Error **errp)
> +{
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +
> +    drck->detach(drc, DEVICE(pdev), spapr_phb_remove_pci_device_cb, phb, errp);
> +}
> +
> +static void spapr_phb_hot_plug_child(HotplugHandler *plug_handler,
> +                                     DeviceState *plugged_dev, Error **errp)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(DEVICE(plug_handler));
> +    PCIDevice *pdev = PCI_DEVICE(plugged_dev);
> +    sPAPRDRConnector *drc = spapr_dr_connector_by_id(
> +        SPAPR_DR_CONNECTOR_TYPE_PCI,
> +        (phb->index << 16) |
> +        (pci_bus_num(PCI_BUS(qdev_get_parent_bus(plugged_dev))) << 8) |
> +        pdev->devfn);

Helper function for that calculation might be clearer.

> +    Error *local_err = NULL;
> +
> +    /* if DR is disabled we don't need to do anything in the case of
> +     * hotplug or coldplug callbacks
> +     */
> +    if (!phb->dr_enabled) {
> +        /* if this is a hotplug operation initiated by the user
> +         * we need to let them know it's not enabled
> +         */
> +        if (plugged_dev->hotplugged) {
> +            error_set(errp, QERR_BUS_NO_HOTPLUG,
> +                      object_get_typename(OBJECT(phb)));
> +        }
> +        return;
> +    }
> +
> +    g_assert(drc);
> +
> +    spapr_phb_add_pci_device(drc, phb, pdev, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +}
> +
> +static void spapr_phb_hot_unplug_child(HotplugHandler *plug_handler,
> +                                       DeviceState *plugged_dev, Error **errp)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(DEVICE(plug_handler));
> +    PCIDevice *pdev = PCI_DEVICE(plugged_dev);
> +    sPAPRDRConnectorClass *drck;
> +    sPAPRDRConnector *drc = spapr_dr_connector_by_id(
> +        SPAPR_DR_CONNECTOR_TYPE_PCI,
> +        (phb->index << 16) |
> +        (pci_bus_num(PCI_BUS(qdev_get_parent_bus(plugged_dev))) << 8) |
> +        pdev->devfn);
> +    Error *local_err = NULL;
> +
> +    if (!phb->dr_enabled) {
> +        error_set(errp, QERR_BUS_NO_HOTPLUG,
> +                  object_get_typename(OBJECT(phb)));
> +        return;
> +    }
> +
> +    g_assert(drc);
> +
> +    drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    if (!drck->release_pending(drc)) {
> +        spapr_phb_remove_pci_device(drc, phb, pdev, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +}
> +
>  static void spapr_phb_realize(DeviceState *dev, Error **errp)
>  {
>      SysBusDevice *s = SYS_BUS_DEVICE(dev);
> @@ -574,6 +936,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>                             &sphb->memspace, &sphb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
>      phb->bus = bus;
> +    qbus_set_hotplug_handler(BUS(phb->bus), DEVICE(sphb), NULL);
>  
>      /*
>       * Initialize PHB address space.
> @@ -810,6 +1173,7 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
>      PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
>      DeviceClass *dc = DEVICE_CLASS(klass);
>      sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
> +    HotplugHandlerClass *hp = HOTPLUG_HANDLER_CLASS(klass);
>  
>      hc->root_bus_path = spapr_phb_root_bus_path;
>      dc->realize = spapr_phb_realize;
> @@ -819,6 +1183,8 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
>      set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
>      dc->cannot_instantiate_with_device_add_yet = false;
>      spc->finish_realize = spapr_phb_finish_realize;
> +    hp->plug = spapr_phb_hot_plug_child;
> +    hp->unplug = spapr_phb_hot_unplug_child;
>  }
>  
>  static const TypeInfo spapr_phb_info = {
> @@ -827,6 +1193,10 @@ static const TypeInfo spapr_phb_info = {
>      .instance_size = sizeof(sPAPRPHBState),
>      .class_init    = spapr_phb_class_init,
>      .class_size    = sizeof(sPAPRPHBClass),
> +    .interfaces    = (InterfaceInfo[]) {
> +        { TYPE_HOTPLUG_HANDLER },
> +        { }
> +    }
>  };
>  
>  PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
> @@ -840,17 +1210,6 @@ PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
>      return PCI_HOST_BRIDGE(dev);
>  }
>  
> -/* Macros to operate with address in OF binding to PCI */
> -#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> -#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> -#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
> -#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
> -#define b_ss(x)         b_x((x), 24, 2) /* the space code */
> -#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
> -#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
> -#define b_fff(x)        b_x((x), 8, 3)  /* function number */
> -#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
> -
>  typedef struct sPAPRTCEDT {
>      void *fdt;
>      int node_off;
> @@ -920,14 +1279,6 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>          return bus_off;
>      }
>  
> -#define _FDT(exp) \
> -    do { \
> -        int ret = (exp);                                           \
> -        if (ret < 0) {                                             \
> -            return ret;                                            \
> -        }                                                          \
> -    } while (0)
> -
>      /* Write PHB properties */
>      _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
>      _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
Michael Roth Feb. 25, 2015, 5:17 a.m. UTC | #2
Quoting David Gibson (2015-02-24 21:11:29)
> On Mon, Feb 16, 2015 at 08:27:51AM -0600, Michael Roth wrote:
> > This enables hotplug for PHB bridges. Upon hotplug we generate the
> 
> "PCI Host Bridge bridges" :-p
> 
> > OF-nodes required by PAPR specification and IEEE 1275-1994
> > "PCI Bus Binding to Open Firmware" for the device.
> > 
> > We associate the corresponding FDT for these nodes with the DrcEntry
> > corresponding to the slot, which will be fetched via
> > ibm,configure-connector RTAS calls by the guest as described by PAPR
> > specification. The FDT is cleaned up in the case of unplug.
> > 
> > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > ---
> >  hw/ppc/spapr_pci.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 371 insertions(+), 20 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > index 6a3d917..b9af1cd 100644
> > --- a/hw/ppc/spapr_pci.c
> > +++ b/hw/ppc/spapr_pci.c
> > @@ -33,8 +33,11 @@
> >  #include <libfdt.h>
> >  #include "trace.h"
> >  #include "qemu/error-report.h"
> > +#include "qapi/qmp/qerror.h"
> >  
> >  #include "hw/pci/pci_bus.h"
> > +#include "hw/ppc/spapr_drc.h"
> > +#include "sysemu/device_tree.h"
> >  
> >  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> >  #define RTAS_QUERY_FN           0
> > @@ -47,7 +50,13 @@
> >  #define RTAS_TYPE_MSI           1
> >  #define RTAS_TYPE_MSIX          2
> >  
> > -#include "hw/ppc/spapr_drc.h"
> > +#define _FDT(exp) \
> > +    do { \
> > +        int ret = (exp);                                           \
> > +        if (ret < 0) {                                             \
> > +            return ret;                                            \
> > +        }                                                          \
> > +    } while (0)
> >  
> >  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
> >  {
> > @@ -481,6 +490,359 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
> >      return &phb->iommu_as;
> >  }
> >  
> > +/* Macros to operate with address in OF binding to PCI */
> > +#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> > +#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> > +#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
> > +#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
> > +#define b_ss(x)         b_x((x), 24, 2) /* the space code */
> > +#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
> > +#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
> > +#define b_fff(x)        b_x((x), 8, 3)  /* function number */
> > +#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
> > +/* for 'reg'/'assigned-addresses' OF properties */
> > +#define RESOURCE_CELLS_SIZE 2
> > +#define RESOURCE_CELLS_ADDRESS 3
> > +
> > +typedef struct ResourceFields {
> > +    uint32_t phys_hi;
> > +    uint32_t phys_mid;
> > +    uint32_t phys_lo;
> > +    uint32_t size_hi;
> > +    uint32_t size_lo;
> > +} ResourceFields;
> 
> Hrm, 5*32-bit ints, that probably needs a ((packed)) on at least some
> platforms to be safe.

I seem to remember some rule (C99?) about padding only being used in cases
where an n-byte field doesn't naturally fall on an n-byte boundary, but
it doesn't hurt to be safe/explicit about what we're expecting.

> 
> > +typedef struct ResourceProps {
> > +    union {
> > +        ResourceFields fields[8];
> > +        uint8_t data[sizeof(ResourceFields) * 8];
> > +    } reg;
> > +    union {
> > +        ResourceFields fields[7];
> > +        uint8_t data[sizeof(ResourceFields) * 7];
> > +    } assigned;
> > +    uint32_t reg_len;
> > +    uint32_t assigned_len;
> > +} ResourceProps;
> 
> I'm a bit dubious about this union, rather than just casting when you
> need the bytestring version.

Yah, I may have gotten a bit carried away there...

> 
> > +
> > +/* fill in the 'reg'/'assigned-resources' OF properties for
> > + * a PCI device. 'reg' describes resource requirements for a
> > + * device's IO/MEM regions, 'assigned-addresses' describes the
> > + * actual resource assignments.
> > + *
> > + * the properties are arrays of ('phys-addr', 'size') pairs describing
> > + * the addressable regions of the PCI device, where 'phys-addr' is a
> > + * RESOURCE_CELLS_ADDRESS-tuple of 32-bit integers corresponding to
> > + * (phys.hi, phys.mid, phys.lo), and 'size' is a
> > + * RESOURCE_CELLS_SIZE-tuple corresponding to (size.hi, size.lo).
> > + * phys.hi = 0xYYXXXXZZ, where:
> > + *   0xYY = npt000ss
> > + *          |||   |
> > + *          |||   +-- space code: 1 if IO region, 2 if MEM region
> > + *          ||+------ for non-relocatable IO: 1 if aliased
> > + *          ||        for relocatable IO: 1 if below 64KB
> > + *          ||        for MEM: 1 if below 1MB
> > + *          |+------- 1 if region is prefetchable
> > + *          +-------- 1 if region is non-relocatable
> > + *   0xXXXX = bbbbbbbb dddddfff, encoding bus, slot, and function
> > + *            bits respectively
> > + *   0xZZ = rrrrrrrr, the register number of the BAR corresponding
> > + *          to the region
> > + *
> > + * phys.mid and phys.lo correspond respectively to the hi/lo portions
> > + * of the actual address of the region.
> > + *
> > + * how the phys-addr/size values are used differ slightly between
> > + * 'reg' and 'assigned-addresses' properties. namely, 'reg' has
> > + * an additional description for the config space region of the
> > + * device, and in the case of QEMU has n=0 and phys.mid=phys.lo=0
> > + * to describe the region as relocatable, with an address-mapping
> > + * that corresponds directly to the PHB's address space for the
> > + * resource. 'assigned-addresses' always has n=1 set with an absolute
> > + * address assigned for the resource. in general, 'assigned-addresses'
> > + * won't be populated, since addresses for PCI devices are generally
> > + * unmapped initially and left to the guest to assign.
> > + *
> > + * in accordance with PCI Bus Binding to Open Firmware,
> > + * IEEE Std 1275-1994, section 4.1.1, as implemented by PAPR+ v2.7,
> > + * Appendix C.
> > + */
> > +static void populate_resource_props(PCIDevice *d, ResourceProps *rp)
> > +{
> > +    int bus_num = pci_bus_num(PCI_BUS(qdev_get_parent_bus(DEVICE(d))));
> > +    uint32_t dev_id = (b_bbbbbbbb(bus_num) |
> > +                       b_ddddd(PCI_SLOT(d->devfn)) |
> > +                       b_fff(PCI_FUNC(d->devfn)));
> > +    ResourceFields *reg, *assigned;
> > +    int i, reg_idx = 0, assigned_idx = 0;
> > +
> > +    /* config space region */
> > +    reg = &rp->reg.fields[reg_idx++];
> > +    reg->phys_hi = cpu_to_be32(dev_id);
> > +    reg->phys_mid = 0;
> > +    reg->phys_lo = 0;
> > +    reg->size_hi = 0;
> > +    reg->size_lo = 0;
> > +
> > +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> > +        if (!d->io_regions[i].size) {
> > +            continue;
> > +        }
> > +
> > +        reg = &rp->reg.fields[reg_idx++];
> > +
> > +        reg->phys_hi = cpu_to_be32(dev_id | b_rrrrrrrr(pci_bar(d, i)));
> > +        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
> > +            reg->phys_hi |= cpu_to_be32(b_ss(1));
> > +        } else {
> > +            reg->phys_hi |= cpu_to_be32(b_ss(2));
> > +        }
> > +        reg->phys_mid = 0;
> > +        reg->phys_lo = 0;
> > +        reg->size_hi = cpu_to_be32(d->io_regions[i].size >> 32);
> > +        reg->size_lo = cpu_to_be32(d->io_regions[i].size);
> > +
> > +        if (d->io_regions[i].addr != PCI_BAR_UNMAPPED) {
> > +            continue;
> 
> This has inverted sense, doesn't it?  If the BAR *is* unmapped you
> want to continue, skipping the assigned-addresses stuff.

Yes, should be inverted :( It ends up not mattering much with the
current userspace code, since we rely on PCI rescan which always
re-assigns BARs, but definitely needs fixing.

> 
> > +        }
> > +
> > +        assigned = &rp->assigned.fields[assigned_idx++];
> > +        assigned->phys_hi = cpu_to_be32(reg->phys_hi | b_n(1));
> > +        assigned->phys_mid = cpu_to_be32(d->io_regions[i].addr >> 32);
> > +        assigned->phys_lo = cpu_to_be32(d->io_regions[i].addr);
> 
> Not sure if I understood the comment above properly; should these
> addresses actually be translated through an mappings above the PHB?
> Not that there are any such translations on sPAPR, but...

The io_regions[i].addr values are relative to the IO/MEM address spaces for
the device, which correspond to IO/MEM windows for the PHB. So I think the
memory API should handle any translation above that...

Or do you mean because of the n=0/relocatable IO regions described by 'reg'?

IIUC, when n=0, reg->phys_{mid,lo} can be used to encode a starting offset,
so a guest can re-assign/re-program an IO region that's already been assigned
and described via the correspond fields in 'assigned-addresses', so long as
uses an addr above reg->phys{mid,lo}.

So, we could use those reg->phys_{mid,lo} values as an alternative to the
PHBs IO/MEM windows (I guess for platforms that don't provide these windows
and just expose the full address space?).

But since we have those windows, we end up not needing this, so we always do
n=0, reg->phys_mid = reg->phys_lo = 0, so the values in
assigned->phys_{mid,lo} end up just being offsets into the IO/MEM windows,
which correspond directly to io_regions[n].addr.
Michael Roth Feb. 25, 2015, 6:40 a.m. UTC | #3
Quoting Michael Roth (2015-02-24 23:17:24)
> Quoting David Gibson (2015-02-24 21:11:29)
> > On Mon, Feb 16, 2015 at 08:27:51AM -0600, Michael Roth wrote:
> > > This enables hotplug for PHB bridges. Upon hotplug we generate the
> > 
> > "PCI Host Bridge bridges" :-p
> > 
> > > OF-nodes required by PAPR specification and IEEE 1275-1994
> > > "PCI Bus Binding to Open Firmware" for the device.
> > > 
> > > We associate the corresponding FDT for these nodes with the DrcEntry
> > > corresponding to the slot, which will be fetched via
> > > ibm,configure-connector RTAS calls by the guest as described by PAPR
> > > specification. The FDT is cleaned up in the case of unplug.
> > > 
> > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > > ---
> > >  hw/ppc/spapr_pci.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 371 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > > index 6a3d917..b9af1cd 100644
> > > --- a/hw/ppc/spapr_pci.c
> > > +++ b/hw/ppc/spapr_pci.c
> > > @@ -33,8 +33,11 @@
> > >  #include <libfdt.h>
> > >  #include "trace.h"
> > >  #include "qemu/error-report.h"
> > > +#include "qapi/qmp/qerror.h"
> > >  
> > >  #include "hw/pci/pci_bus.h"
> > > +#include "hw/ppc/spapr_drc.h"
> > > +#include "sysemu/device_tree.h"
> > >  
> > >  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> > >  #define RTAS_QUERY_FN           0
> > > @@ -47,7 +50,13 @@
> > >  #define RTAS_TYPE_MSI           1
> > >  #define RTAS_TYPE_MSIX          2
> > >  
> > > -#include "hw/ppc/spapr_drc.h"
> > > +#define _FDT(exp) \
> > > +    do { \
> > > +        int ret = (exp);                                           \
> > > +        if (ret < 0) {                                             \
> > > +            return ret;                                            \
> > > +        }                                                          \
> > > +    } while (0)
> > >  
> > >  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
> > >  {
> > > @@ -481,6 +490,359 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
> > >      return &phb->iommu_as;
> > >  }
> > >  
> > > +/* Macros to operate with address in OF binding to PCI */
> > > +#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> > > +#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> > > +#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
> > > +#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
> > > +#define b_ss(x)         b_x((x), 24, 2) /* the space code */
> > > +#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
> > > +#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
> > > +#define b_fff(x)        b_x((x), 8, 3)  /* function number */
> > > +#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
> > > +/* for 'reg'/'assigned-addresses' OF properties */
> > > +#define RESOURCE_CELLS_SIZE 2
> > > +#define RESOURCE_CELLS_ADDRESS 3
> > > +
> > > +typedef struct ResourceFields {
> > > +    uint32_t phys_hi;
> > > +    uint32_t phys_mid;
> > > +    uint32_t phys_lo;
> > > +    uint32_t size_hi;
> > > +    uint32_t size_lo;
> > > +} ResourceFields;
> > 
> > Hrm, 5*32-bit ints, that probably needs a ((packed)) on at least some
> > platforms to be safe.
> 
> I seem to remember some rule (C99?) about padding only being used in cases
> where an n-byte field doesn't naturally fall on an n-byte boundary, but
> it doesn't hurt to be safe/explicit about what we're expecting.
> 
> > 
> > > +typedef struct ResourceProps {
> > > +    union {
> > > +        ResourceFields fields[8];
> > > +        uint8_t data[sizeof(ResourceFields) * 8];
> > > +    } reg;
> > > +    union {
> > > +        ResourceFields fields[7];
> > > +        uint8_t data[sizeof(ResourceFields) * 7];
> > > +    } assigned;
> > > +    uint32_t reg_len;
> > > +    uint32_t assigned_len;
> > > +} ResourceProps;
> > 
> > I'm a bit dubious about this union, rather than just casting when you
> > need the bytestring version.
> 
> Yah, I may have gotten a bit carried away there...
> 
> > 
> > > +
> > > +/* fill in the 'reg'/'assigned-resources' OF properties for
> > > + * a PCI device. 'reg' describes resource requirements for a
> > > + * device's IO/MEM regions, 'assigned-addresses' describes the
> > > + * actual resource assignments.
> > > + *
> > > + * the properties are arrays of ('phys-addr', 'size') pairs describing
> > > + * the addressable regions of the PCI device, where 'phys-addr' is a
> > > + * RESOURCE_CELLS_ADDRESS-tuple of 32-bit integers corresponding to
> > > + * (phys.hi, phys.mid, phys.lo), and 'size' is a
> > > + * RESOURCE_CELLS_SIZE-tuple corresponding to (size.hi, size.lo).
> > > + * phys.hi = 0xYYXXXXZZ, where:
> > > + *   0xYY = npt000ss
> > > + *          |||   |
> > > + *          |||   +-- space code: 1 if IO region, 2 if MEM region
> > > + *          ||+------ for non-relocatable IO: 1 if aliased
> > > + *          ||        for relocatable IO: 1 if below 64KB
> > > + *          ||        for MEM: 1 if below 1MB
> > > + *          |+------- 1 if region is prefetchable
> > > + *          +-------- 1 if region is non-relocatable
> > > + *   0xXXXX = bbbbbbbb dddddfff, encoding bus, slot, and function
> > > + *            bits respectively
> > > + *   0xZZ = rrrrrrrr, the register number of the BAR corresponding
> > > + *          to the region
> > > + *
> > > + * phys.mid and phys.lo correspond respectively to the hi/lo portions
> > > + * of the actual address of the region.
> > > + *
> > > + * how the phys-addr/size values are used differ slightly between
> > > + * 'reg' and 'assigned-addresses' properties. namely, 'reg' has
> > > + * an additional description for the config space region of the
> > > + * device, and in the case of QEMU has n=0 and phys.mid=phys.lo=0
> > > + * to describe the region as relocatable, with an address-mapping
> > > + * that corresponds directly to the PHB's address space for the
> > > + * resource. 'assigned-addresses' always has n=1 set with an absolute
> > > + * address assigned for the resource. in general, 'assigned-addresses'
> > > + * won't be populated, since addresses for PCI devices are generally
> > > + * unmapped initially and left to the guest to assign.
> > > + *
> > > + * in accordance with PCI Bus Binding to Open Firmware,
> > > + * IEEE Std 1275-1994, section 4.1.1, as implemented by PAPR+ v2.7,
> > > + * Appendix C.
> > > + */
> > > +static void populate_resource_props(PCIDevice *d, ResourceProps *rp)
> > > +{
> > > +    int bus_num = pci_bus_num(PCI_BUS(qdev_get_parent_bus(DEVICE(d))));
> > > +    uint32_t dev_id = (b_bbbbbbbb(bus_num) |
> > > +                       b_ddddd(PCI_SLOT(d->devfn)) |
> > > +                       b_fff(PCI_FUNC(d->devfn)));
> > > +    ResourceFields *reg, *assigned;
> > > +    int i, reg_idx = 0, assigned_idx = 0;
> > > +
> > > +    /* config space region */
> > > +    reg = &rp->reg.fields[reg_idx++];
> > > +    reg->phys_hi = cpu_to_be32(dev_id);
> > > +    reg->phys_mid = 0;
> > > +    reg->phys_lo = 0;
> > > +    reg->size_hi = 0;
> > > +    reg->size_lo = 0;
> > > +
> > > +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> > > +        if (!d->io_regions[i].size) {
> > > +            continue;
> > > +        }
> > > +
> > > +        reg = &rp->reg.fields[reg_idx++];
> > > +
> > > +        reg->phys_hi = cpu_to_be32(dev_id | b_rrrrrrrr(pci_bar(d, i)));
> > > +        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
> > > +            reg->phys_hi |= cpu_to_be32(b_ss(1));
> > > +        } else {
> > > +            reg->phys_hi |= cpu_to_be32(b_ss(2));
> > > +        }
> > > +        reg->phys_mid = 0;
> > > +        reg->phys_lo = 0;
> > > +        reg->size_hi = cpu_to_be32(d->io_regions[i].size >> 32);
> > > +        reg->size_lo = cpu_to_be32(d->io_regions[i].size);
> > > +
> > > +        if (d->io_regions[i].addr != PCI_BAR_UNMAPPED) {
> > > +            continue;
> > 
> > This has inverted sense, doesn't it?  If the BAR *is* unmapped you
> > want to continue, skipping the assigned-addresses stuff.
> 
> Yes, should be inverted :( It ends up not mattering much with the
> current userspace code, since we rely on PCI rescan which always
> re-assigns BARs, but definitely needs fixing.
> 
> > 
> > > +        }
> > > +
> > > +        assigned = &rp->assigned.fields[assigned_idx++];
> > > +        assigned->phys_hi = cpu_to_be32(reg->phys_hi | b_n(1));
> > > +        assigned->phys_mid = cpu_to_be32(d->io_regions[i].addr >> 32);
> > > +        assigned->phys_lo = cpu_to_be32(d->io_regions[i].addr);
> > 
> > Not sure if I understood the comment above properly; should these
> > addresses actually be translated through an mappings above the PHB?
> > Not that there are any such translations on sPAPR, but...
> 
> The io_regions[i].addr values are relative to the IO/MEM address spaces for
> the device, which correspond to IO/MEM windows for the PHB. So I think the
> memory API should handle any translation above that...
> 
> Or do you mean because of the n=0/relocatable IO regions described by 'reg'?
> 
> IIUC, when n=0, reg->phys_{mid,lo} can be used to encode a starting offset,
> so a guest can re-assign/re-program an IO region that's already been assigned
> and described via the correspond fields in 'assigned-addresses', so long as
> uses an addr above reg->phys{mid,lo}.
> 
> So, we could use those reg->phys_{mid,lo} values as an alternative to the
> PHBs IO/MEM windows (I guess for platforms that don't provide these windows
> and just expose the full address space?).
> 
> But since we have those windows, we end up not needing this, so we always do
> n=0, reg->phys_mid = reg->phys_lo = 0, so the values in
> assigned->phys_{mid,lo} end up just being offsets into the IO/MEM windows,
> which correspond directly to io_regions[n].addr.

Argh, not sure what I was thinking. io_regions[n].addr is relative to the
IO/MEM window, so from the guest perspective .addr is indeed a different
value...

So maybe reg->phys_mid/reg->phys_lo do in fact need to reflect the window
offsets so that the relocatable region assignments are offset
accordingly...

Will look into it more.
David Gibson Feb. 26, 2015, 3:49 a.m. UTC | #4
On Tue, Feb 24, 2015 at 11:17:24PM -0600, Michael Roth wrote:
> Quoting David Gibson (2015-02-24 21:11:29)
> > On Mon, Feb 16, 2015 at 08:27:51AM -0600, Michael Roth wrote:
> > > This enables hotplug for PHB bridges. Upon hotplug we generate the
> > 
> > "PCI Host Bridge bridges" :-p
> > 
> > > OF-nodes required by PAPR specification and IEEE 1275-1994
> > > "PCI Bus Binding to Open Firmware" for the device.
> > > 
> > > We associate the corresponding FDT for these nodes with the DrcEntry
> > > corresponding to the slot, which will be fetched via
> > > ibm,configure-connector RTAS calls by the guest as described by PAPR
> > > specification. The FDT is cleaned up in the case of unplug.
> > > 
> > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > > ---
> > >  hw/ppc/spapr_pci.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 371 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > > index 6a3d917..b9af1cd 100644
> > > --- a/hw/ppc/spapr_pci.c
> > > +++ b/hw/ppc/spapr_pci.c
> > > @@ -33,8 +33,11 @@
> > >  #include <libfdt.h>
> > >  #include "trace.h"
> > >  #include "qemu/error-report.h"
> > > +#include "qapi/qmp/qerror.h"
> > >  
> > >  #include "hw/pci/pci_bus.h"
> > > +#include "hw/ppc/spapr_drc.h"
> > > +#include "sysemu/device_tree.h"
> > >  
> > >  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> > >  #define RTAS_QUERY_FN           0
> > > @@ -47,7 +50,13 @@
> > >  #define RTAS_TYPE_MSI           1
> > >  #define RTAS_TYPE_MSIX          2
> > >  
> > > -#include "hw/ppc/spapr_drc.h"
> > > +#define _FDT(exp) \
> > > +    do { \
> > > +        int ret = (exp);                                           \
> > > +        if (ret < 0) {                                             \
> > > +            return ret;                                            \
> > > +        }                                                          \
> > > +    } while (0)
> > >  
> > >  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
> > >  {
> > > @@ -481,6 +490,359 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
> > >      return &phb->iommu_as;
> > >  }
> > >  
> > > +/* Macros to operate with address in OF binding to PCI */
> > > +#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> > > +#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> > > +#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
> > > +#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
> > > +#define b_ss(x)         b_x((x), 24, 2) /* the space code */
> > > +#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
> > > +#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
> > > +#define b_fff(x)        b_x((x), 8, 3)  /* function number */
> > > +#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
> > > +/* for 'reg'/'assigned-addresses' OF properties */
> > > +#define RESOURCE_CELLS_SIZE 2
> > > +#define RESOURCE_CELLS_ADDRESS 3
> > > +
> > > +typedef struct ResourceFields {
> > > +    uint32_t phys_hi;
> > > +    uint32_t phys_mid;
> > > +    uint32_t phys_lo;
> > > +    uint32_t size_hi;
> > > +    uint32_t size_lo;
> > > +} ResourceFields;
> > 
> > Hrm, 5*32-bit ints, that probably needs a ((packed)) on at least some
> > platforms to be safe.
> 
> I seem to remember some rule (C99?) about padding only being used in cases
> where an n-byte field doesn't naturally fall on an n-byte boundary, but
> it doesn't hurt to be safe/explicit about what we're expecting.

So, the potential problem is end-of-structure padding rather than
between-field padding.  I know some platforms (ARMs I think) at least
used to round structures out to 8 bytes, even if all the members were
less than that.  Mind you at the time, I believe ((packed)) didn't
actually fix it, causing headaches with nested structures.
Michael Roth Feb. 26, 2015, 8:06 p.m. UTC | #5
Quoting Michael Roth (2015-02-25 00:40:26)
> Quoting Michael Roth (2015-02-24 23:17:24)
> > Quoting David Gibson (2015-02-24 21:11:29)
> > > On Mon, Feb 16, 2015 at 08:27:51AM -0600, Michael Roth wrote:
> > > > This enables hotplug for PHB bridges. Upon hotplug we generate the
> > > 
> > > "PCI Host Bridge bridges" :-p
> > > 
> > > > OF-nodes required by PAPR specification and IEEE 1275-1994
> > > > "PCI Bus Binding to Open Firmware" for the device.
> > > > 
> > > > We associate the corresponding FDT for these nodes with the DrcEntry
> > > > corresponding to the slot, which will be fetched via
> > > > ibm,configure-connector RTAS calls by the guest as described by PAPR
> > > > specification. The FDT is cleaned up in the case of unplug.
> > > > 
> > > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > > > ---
> > > >  hw/ppc/spapr_pci.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 371 insertions(+), 20 deletions(-)
> > > > 
> > > > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > > > index 6a3d917..b9af1cd 100644
> > > > --- a/hw/ppc/spapr_pci.c
> > > > +++ b/hw/ppc/spapr_pci.c
> > > > @@ -33,8 +33,11 @@
> > > >  #include <libfdt.h>
> > > >  #include "trace.h"
> > > >  #include "qemu/error-report.h"
> > > > +#include "qapi/qmp/qerror.h"
> > > >  
> > > >  #include "hw/pci/pci_bus.h"
> > > > +#include "hw/ppc/spapr_drc.h"
> > > > +#include "sysemu/device_tree.h"
> > > >  
> > > >  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> > > >  #define RTAS_QUERY_FN           0
> > > > @@ -47,7 +50,13 @@
> > > >  #define RTAS_TYPE_MSI           1
> > > >  #define RTAS_TYPE_MSIX          2
> > > >  
> > > > -#include "hw/ppc/spapr_drc.h"
> > > > +#define _FDT(exp) \
> > > > +    do { \
> > > > +        int ret = (exp);                                           \
> > > > +        if (ret < 0) {                                             \
> > > > +            return ret;                                            \
> > > > +        }                                                          \
> > > > +    } while (0)
> > > >  
> > > >  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
> > > >  {
> > > > @@ -481,6 +490,359 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
> > > >      return &phb->iommu_as;
> > > >  }
> > > >  
> > > > +/* Macros to operate with address in OF binding to PCI */
> > > > +#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
> > > > +#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
> > > > +#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
> > > > +#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
> > > > +#define b_ss(x)         b_x((x), 24, 2) /* the space code */
> > > > +#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
> > > > +#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
> > > > +#define b_fff(x)        b_x((x), 8, 3)  /* function number */
> > > > +#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
> > > > +/* for 'reg'/'assigned-addresses' OF properties */
> > > > +#define RESOURCE_CELLS_SIZE 2
> > > > +#define RESOURCE_CELLS_ADDRESS 3
> > > > +
> > > > +typedef struct ResourceFields {
> > > > +    uint32_t phys_hi;
> > > > +    uint32_t phys_mid;
> > > > +    uint32_t phys_lo;
> > > > +    uint32_t size_hi;
> > > > +    uint32_t size_lo;
> > > > +} ResourceFields;
> > > 
> > > Hrm, 5*32-bit ints, that probably needs a ((packed)) on at least some
> > > platforms to be safe.
> > 
> > I seem to remember some rule (C99?) about padding only being used in cases
> > where an n-byte field doesn't naturally fall on an n-byte boundary, but
> > it doesn't hurt to be safe/explicit about what we're expecting.
> > 
> > > 
> > > > +typedef struct ResourceProps {
> > > > +    union {
> > > > +        ResourceFields fields[8];
> > > > +        uint8_t data[sizeof(ResourceFields) * 8];
> > > > +    } reg;
> > > > +    union {
> > > > +        ResourceFields fields[7];
> > > > +        uint8_t data[sizeof(ResourceFields) * 7];
> > > > +    } assigned;
> > > > +    uint32_t reg_len;
> > > > +    uint32_t assigned_len;
> > > > +} ResourceProps;
> > > 
> > > I'm a bit dubious about this union, rather than just casting when you
> > > need the bytestring version.
> > 
> > Yah, I may have gotten a bit carried away there...
> > 
> > > 
> > > > +
> > > > +/* fill in the 'reg'/'assigned-resources' OF properties for
> > > > + * a PCI device. 'reg' describes resource requirements for a
> > > > + * device's IO/MEM regions, 'assigned-addresses' describes the
> > > > + * actual resource assignments.
> > > > + *
> > > > + * the properties are arrays of ('phys-addr', 'size') pairs describing
> > > > + * the addressable regions of the PCI device, where 'phys-addr' is a
> > > > + * RESOURCE_CELLS_ADDRESS-tuple of 32-bit integers corresponding to
> > > > + * (phys.hi, phys.mid, phys.lo), and 'size' is a
> > > > + * RESOURCE_CELLS_SIZE-tuple corresponding to (size.hi, size.lo).
> > > > + * phys.hi = 0xYYXXXXZZ, where:
> > > > + *   0xYY = npt000ss
> > > > + *          |||   |
> > > > + *          |||   +-- space code: 1 if IO region, 2 if MEM region
> > > > + *          ||+------ for non-relocatable IO: 1 if aliased
> > > > + *          ||        for relocatable IO: 1 if below 64KB
> > > > + *          ||        for MEM: 1 if below 1MB
> > > > + *          |+------- 1 if region is prefetchable
> > > > + *          +-------- 1 if region is non-relocatable
> > > > + *   0xXXXX = bbbbbbbb dddddfff, encoding bus, slot, and function
> > > > + *            bits respectively
> > > > + *   0xZZ = rrrrrrrr, the register number of the BAR corresponding
> > > > + *          to the region
> > > > + *
> > > > + * phys.mid and phys.lo correspond respectively to the hi/lo portions
> > > > + * of the actual address of the region.
> > > > + *
> > > > + * how the phys-addr/size values are used differ slightly between
> > > > + * 'reg' and 'assigned-addresses' properties. namely, 'reg' has
> > > > + * an additional description for the config space region of the
> > > > + * device, and in the case of QEMU has n=0 and phys.mid=phys.lo=0
> > > > + * to describe the region as relocatable, with an address-mapping
> > > > + * that corresponds directly to the PHB's address space for the
> > > > + * resource. 'assigned-addresses' always has n=1 set with an absolute
> > > > + * address assigned for the resource. in general, 'assigned-addresses'
> > > > + * won't be populated, since addresses for PCI devices are generally
> > > > + * unmapped initially and left to the guest to assign.
> > > > + *
> > > > + * in accordance with PCI Bus Binding to Open Firmware,
> > > > + * IEEE Std 1275-1994, section 4.1.1, as implemented by PAPR+ v2.7,
> > > > + * Appendix C.
> > > > + */
> > > > +static void populate_resource_props(PCIDevice *d, ResourceProps *rp)
> > > > +{
> > > > +    int bus_num = pci_bus_num(PCI_BUS(qdev_get_parent_bus(DEVICE(d))));
> > > > +    uint32_t dev_id = (b_bbbbbbbb(bus_num) |
> > > > +                       b_ddddd(PCI_SLOT(d->devfn)) |
> > > > +                       b_fff(PCI_FUNC(d->devfn)));
> > > > +    ResourceFields *reg, *assigned;
> > > > +    int i, reg_idx = 0, assigned_idx = 0;
> > > > +
> > > > +    /* config space region */
> > > > +    reg = &rp->reg.fields[reg_idx++];
> > > > +    reg->phys_hi = cpu_to_be32(dev_id);
> > > > +    reg->phys_mid = 0;
> > > > +    reg->phys_lo = 0;
> > > > +    reg->size_hi = 0;
> > > > +    reg->size_lo = 0;
> > > > +
> > > > +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> > > > +        if (!d->io_regions[i].size) {
> > > > +            continue;
> > > > +        }
> > > > +
> > > > +        reg = &rp->reg.fields[reg_idx++];
> > > > +
> > > > +        reg->phys_hi = cpu_to_be32(dev_id | b_rrrrrrrr(pci_bar(d, i)));
> > > > +        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
> > > > +            reg->phys_hi |= cpu_to_be32(b_ss(1));
> > > > +        } else {
> > > > +            reg->phys_hi |= cpu_to_be32(b_ss(2));
> > > > +        }
> > > > +        reg->phys_mid = 0;
> > > > +        reg->phys_lo = 0;
> > > > +        reg->size_hi = cpu_to_be32(d->io_regions[i].size >> 32);
> > > > +        reg->size_lo = cpu_to_be32(d->io_regions[i].size);
> > > > +
> > > > +        if (d->io_regions[i].addr != PCI_BAR_UNMAPPED) {
> > > > +            continue;
> > > 
> > > This has inverted sense, doesn't it?  If the BAR *is* unmapped you
> > > want to continue, skipping the assigned-addresses stuff.
> > 
> > Yes, should be inverted :( It ends up not mattering much with the
> > current userspace code, since we rely on PCI rescan which always
> > re-assigns BARs, but definitely needs fixing.
> > 
> > > 
> > > > +        }
> > > > +
> > > > +        assigned = &rp->assigned.fields[assigned_idx++];
> > > > +        assigned->phys_hi = cpu_to_be32(reg->phys_hi | b_n(1));
> > > > +        assigned->phys_mid = cpu_to_be32(d->io_regions[i].addr >> 32);
> > > > +        assigned->phys_lo = cpu_to_be32(d->io_regions[i].addr);
> > > 
> > > Not sure if I understood the comment above properly; should these
> > > addresses actually be translated through an mappings above the PHB?
> > > Not that there are any such translations on sPAPR, but...
> > 
> > The io_regions[i].addr values are relative to the IO/MEM address spaces for
> > the device, which correspond to IO/MEM windows for the PHB. So I think the
> > memory API should handle any translation above that...
> > 
> > Or do you mean because of the n=0/relocatable IO regions described by 'reg'?
> > 
> > IIUC, when n=0, reg->phys_{mid,lo} can be used to encode a starting offset,
> > so a guest can re-assign/re-program an IO region that's already been assigned
> > and described via the correspond fields in 'assigned-addresses', so long as
> > uses an addr above reg->phys{mid,lo}.
> > 
> > So, we could use those reg->phys_{mid,lo} values as an alternative to the
> > PHBs IO/MEM windows (I guess for platforms that don't provide these windows
> > and just expose the full address space?).
> > 
> > But since we have those windows, we end up not needing this, so we always do
> > n=0, reg->phys_mid = reg->phys_lo = 0, so the values in
> > assigned->phys_{mid,lo} end up just being offsets into the IO/MEM windows,
> > which correspond directly to io_regions[n].addr.
> 
> Argh, not sure what I was thinking. io_regions[n].addr is relative to the
> IO/MEM window, so from the guest perspective .addr is indeed a different
> value...
> 
> So maybe reg->phys_mid/reg->phys_lo do in fact need to reflect the window
> offsets so that the relocatable region assignments are offset
> accordingly...
> 
> Will look into it more.

So, maybe my first response wasn't as misguided as I thought. I haven't
found any documentation that lays out the specifics, but the kernel code makes
it fairly clear that the IO/MEM addresses in assigned-addresses are relative
to PHB's IO/MEM windows.

The code basically does this:

  arch/powerpc/kernel/pci_of_scan.c: of_pci_parse_addrs()
    addrs = of_get('assigned-addresses')
    config_offset = addrs[0] && 0xff (phys.hi)
    pci_bar_num = (config_offset - PCI_BASE_ADDRESS_0) / 4
    base = &addrs[1:2] (phys.mid/phys.lo)
    size = &addrs[3:4] (size.hi, size.lo)
    region.start = base
    region.end = base + size -1
    res = &dev->resource[pci_bar_num];
    pcibios_bus_to_resource(dev->bus, res, &region)
  drivers/pci/host-bridge.c:pcibios_bus_to_resource(bus, bar_res, bar_region)
    bridge = pci_host_bridge(bus)
    for window in bridge->windows:
      # skip if window type (IO/MEM), doesn't match device BAR's res)
      bus_region.start = window->res->start - window->offset
      bus_region.end = window->res->end - window->offset
      if bus_region contains device BAR region:
        offset = window->offset
        break
    //take the region, add the window offset, and use that as the resource addr
    res->start = region->start + offset
    res->end = region->end + offset

I tested this with the rpaphp hotplug module, which honors
assigned-addresses (current guest hotplug uses generic pci rescan instead
due to a long-standing issue with rpaphp only being able to handle
1 slot per PHB if there are devices present at module-load time), by adding
some test code to pre-assign the BARs during hotplug (with addresses relative
to PHB's IO/MEM windows) to make sure assigned-addresses values get properly
translated to the correct offset in guest's io/system address space.

Will add a note in the comments.
David Gibson Feb. 27, 2015, 5:14 a.m. UTC | #6
On Thu, Feb 26, 2015 at 02:06:04PM -0600, Michael Roth wrote:
> Quoting Michael Roth (2015-02-25 00:40:26)
> > Quoting Michael Roth (2015-02-24 23:17:24)
> > > Quoting David Gibson (2015-02-24 21:11:29)
> > > > On Mon, Feb 16, 2015 at 08:27:51AM -0600, Michael Roth wrote:
[snip]
> > > > > +        }
> > > > > +
> > > > > +        assigned = &rp->assigned.fields[assigned_idx++];
> > > > > +        assigned->phys_hi = cpu_to_be32(reg->phys_hi | b_n(1));
> > > > > +        assigned->phys_mid = cpu_to_be32(d->io_regions[i].addr >> 32);
> > > > > +        assigned->phys_lo = cpu_to_be32(d->io_regions[i].addr);
> > > > 
> > > > Not sure if I understood the comment above properly; should these
> > > > addresses actually be translated through an mappings above the PHB?
> > > > Not that there are any such translations on sPAPR, but...
> > > 
> > > The io_regions[i].addr values are relative to the IO/MEM address spaces for
> > > the device, which correspond to IO/MEM windows for the PHB. So I think the
> > > memory API should handle any translation above that...
> > > 
> > > Or do you mean because of the n=0/relocatable IO regions described by 'reg'?
> > > 
> > > IIUC, when n=0, reg->phys_{mid,lo} can be used to encode a starting offset,
> > > so a guest can re-assign/re-program an IO region that's already been assigned
> > > and described via the correspond fields in 'assigned-addresses', so long as
> > > uses an addr above reg->phys{mid,lo}.
> > > 
> > > So, we could use those reg->phys_{mid,lo} values as an alternative to the
> > > PHBs IO/MEM windows (I guess for platforms that don't provide these windows
> > > and just expose the full address space?).
> > > 
> > > But since we have those windows, we end up not needing this, so we always do
> > > n=0, reg->phys_mid = reg->phys_lo = 0, so the values in
> > > assigned->phys_{mid,lo} end up just being offsets into the IO/MEM windows,
> > > which correspond directly to io_regions[n].addr.
> > 
> > Argh, not sure what I was thinking. io_regions[n].addr is relative to the
> > IO/MEM window, so from the guest perspective .addr is indeed a different
> > value...
> > 
> > So maybe reg->phys_mid/reg->phys_lo do in fact need to reflect the window
> > offsets so that the relocatable region assignments are offset
> > accordingly...
> > 
> > Will look into it more.
> 
> So, maybe my first response wasn't as misguided as I thought. I haven't
> found any documentation that lays out the specifics, but the kernel code makes
> it fairly clear that the IO/MEM addresses in assigned-addresses are relative
> to PHB's IO/MEM windows.
> 
> The code basically does this:
> 
>   arch/powerpc/kernel/pci_of_scan.c: of_pci_parse_addrs()
>     addrs = of_get('assigned-addresses')
>     config_offset = addrs[0] && 0xff (phys.hi)
>     pci_bar_num = (config_offset - PCI_BASE_ADDRESS_0) / 4
>     base = &addrs[1:2] (phys.mid/phys.lo)
>     size = &addrs[3:4] (size.hi, size.lo)
>     region.start = base
>     region.end = base + size -1
>     res = &dev->resource[pci_bar_num];
>     pcibios_bus_to_resource(dev->bus, res, &region)
>   drivers/pci/host-bridge.c:pcibios_bus_to_resource(bus, bar_res, bar_region)
>     bridge = pci_host_bridge(bus)
>     for window in bridge->windows:
>       # skip if window type (IO/MEM), doesn't match device BAR's res)
>       bus_region.start = window->res->start - window->offset
>       bus_region.end = window->res->end - window->offset
>       if bus_region contains device BAR region:
>         offset = window->offset
>         break
>     //take the region, add the window offset, and use that as the resource addr
>     res->start = region->start + offset
>     res->end = region->end + offset
> 
> I tested this with the rpaphp hotplug module, which honors
> assigned-addresses (current guest hotplug uses generic pci rescan instead
> due to a long-standing issue with rpaphp only being able to handle
> 1 slot per PHB if there are devices present at module-load time), by adding
> some test code to pre-assign the BARs during hotplug (with addresses relative
> to PHB's IO/MEM windows) to make sure assigned-addresses values get properly
> translated to the correct offset in guest's io/system address space.
> 
> Will add a note in the comments.

Ok, sounds like the only problem was a confusing description in the
comment that made it seem like the assigned-address values were GPAs
rather than relative to the PHB windows.
diff mbox

Patch

diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 6a3d917..b9af1cd 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -33,8 +33,11 @@ 
 #include <libfdt.h>
 #include "trace.h"
 #include "qemu/error-report.h"
+#include "qapi/qmp/qerror.h"
 
 #include "hw/pci/pci_bus.h"
+#include "hw/ppc/spapr_drc.h"
+#include "sysemu/device_tree.h"
 
 /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
 #define RTAS_QUERY_FN           0
@@ -47,7 +50,13 @@ 
 #define RTAS_TYPE_MSI           1
 #define RTAS_TYPE_MSIX          2
 
-#include "hw/ppc/spapr_drc.h"
+#define _FDT(exp) \
+    do { \
+        int ret = (exp);                                           \
+        if (ret < 0) {                                             \
+            return ret;                                            \
+        }                                                          \
+    } while (0)
 
 static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
 {
@@ -481,6 +490,359 @@  static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
     return &phb->iommu_as;
 }
 
+/* Macros to operate with address in OF binding to PCI */
+#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
+#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
+#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
+#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
+#define b_ss(x)         b_x((x), 24, 2) /* the space code */
+#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
+#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
+#define b_fff(x)        b_x((x), 8, 3)  /* function number */
+#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
+
+/* for 'reg'/'assigned-addresses' OF properties */
+#define RESOURCE_CELLS_SIZE 2
+#define RESOURCE_CELLS_ADDRESS 3
+
+typedef struct ResourceFields {
+    uint32_t phys_hi;
+    uint32_t phys_mid;
+    uint32_t phys_lo;
+    uint32_t size_hi;
+    uint32_t size_lo;
+} ResourceFields;
+
+typedef struct ResourceProps {
+    union {
+        ResourceFields fields[8];
+        uint8_t data[sizeof(ResourceFields) * 8];
+    } reg;
+    union {
+        ResourceFields fields[7];
+        uint8_t data[sizeof(ResourceFields) * 7];
+    } assigned;
+    uint32_t reg_len;
+    uint32_t assigned_len;
+} ResourceProps;
+
+/* fill in the 'reg'/'assigned-resources' OF properties for
+ * a PCI device. 'reg' describes resource requirements for a
+ * device's IO/MEM regions, 'assigned-addresses' describes the
+ * actual resource assignments.
+ *
+ * the properties are arrays of ('phys-addr', 'size') pairs describing
+ * the addressable regions of the PCI device, where 'phys-addr' is a
+ * RESOURCE_CELLS_ADDRESS-tuple of 32-bit integers corresponding to
+ * (phys.hi, phys.mid, phys.lo), and 'size' is a
+ * RESOURCE_CELLS_SIZE-tuple corresponding to (size.hi, size.lo).
+ *
+ * phys.hi = 0xYYXXXXZZ, where:
+ *   0xYY = npt000ss
+ *          |||   |
+ *          |||   +-- space code: 1 if IO region, 2 if MEM region
+ *          ||+------ for non-relocatable IO: 1 if aliased
+ *          ||        for relocatable IO: 1 if below 64KB
+ *          ||        for MEM: 1 if below 1MB
+ *          |+------- 1 if region is prefetchable
+ *          +-------- 1 if region is non-relocatable
+ *   0xXXXX = bbbbbbbb dddddfff, encoding bus, slot, and function
+ *            bits respectively
+ *   0xZZ = rrrrrrrr, the register number of the BAR corresponding
+ *          to the region
+ *
+ * phys.mid and phys.lo correspond respectively to the hi/lo portions
+ * of the actual address of the region.
+ *
+ * how the phys-addr/size values are used differ slightly between
+ * 'reg' and 'assigned-addresses' properties. namely, 'reg' has
+ * an additional description for the config space region of the
+ * device, and in the case of QEMU has n=0 and phys.mid=phys.lo=0
+ * to describe the region as relocatable, with an address-mapping
+ * that corresponds directly to the PHB's address space for the
+ * resource. 'assigned-addresses' always has n=1 set with an absolute
+ * address assigned for the resource. in general, 'assigned-addresses'
+ * won't be populated, since addresses for PCI devices are generally
+ * unmapped initially and left to the guest to assign.
+ *
+ * in accordance with PCI Bus Binding to Open Firmware,
+ * IEEE Std 1275-1994, section 4.1.1, as implemented by PAPR+ v2.7,
+ * Appendix C.
+ */
+static void populate_resource_props(PCIDevice *d, ResourceProps *rp)
+{
+    int bus_num = pci_bus_num(PCI_BUS(qdev_get_parent_bus(DEVICE(d))));
+    uint32_t dev_id = (b_bbbbbbbb(bus_num) |
+                       b_ddddd(PCI_SLOT(d->devfn)) |
+                       b_fff(PCI_FUNC(d->devfn)));
+    ResourceFields *reg, *assigned;
+    int i, reg_idx = 0, assigned_idx = 0;
+
+    /* config space region */
+    reg = &rp->reg.fields[reg_idx++];
+    reg->phys_hi = cpu_to_be32(dev_id);
+    reg->phys_mid = 0;
+    reg->phys_lo = 0;
+    reg->size_hi = 0;
+    reg->size_lo = 0;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        if (!d->io_regions[i].size) {
+            continue;
+        }
+
+        reg = &rp->reg.fields[reg_idx++];
+
+        reg->phys_hi = cpu_to_be32(dev_id | b_rrrrrrrr(pci_bar(d, i)));
+        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
+            reg->phys_hi |= cpu_to_be32(b_ss(1));
+        } else {
+            reg->phys_hi |= cpu_to_be32(b_ss(2));
+        }
+        reg->phys_mid = 0;
+        reg->phys_lo = 0;
+        reg->size_hi = cpu_to_be32(d->io_regions[i].size >> 32);
+        reg->size_lo = cpu_to_be32(d->io_regions[i].size);
+
+        if (d->io_regions[i].addr != PCI_BAR_UNMAPPED) {
+            continue;
+        }
+
+        assigned = &rp->assigned.fields[assigned_idx++];
+        assigned->phys_hi = cpu_to_be32(reg->phys_hi | b_n(1));
+        assigned->phys_mid = cpu_to_be32(d->io_regions[i].addr >> 32);
+        assigned->phys_lo = cpu_to_be32(d->io_regions[i].addr);
+        assigned->size_hi = reg->size_hi;
+        assigned->size_lo = reg->size_lo;
+    }
+
+    rp->reg_len = reg_idx * sizeof(ResourceFields);
+    rp->assigned_len = assigned_idx * sizeof(ResourceFields);
+}
+
+static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
+                                       int phb_index, int drc_index,
+                                       const char *drc_name)
+{
+    ResourceProps rp;
+    bool is_bridge = false;
+    int pci_status;
+
+    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
+        PCI_HEADER_TYPE_BRIDGE) {
+        is_bridge = true;
+    }
+
+    /* in accordance with PAPR+ v2.7 13.6.3, Table 181 */
+    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
+                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
+    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
+                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
+    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
+                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
+    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
+                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2)
+                            << 8));
+    if (pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)) {
+        _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
+                 pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
+    }
+
+    if (!is_bridge) {
+        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
+            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
+        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
+            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
+    }
+
+    if (pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)) {
+        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
+                 pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
+    }
+
+    if (pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)) {
+        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
+                 pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
+    }
+
+    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
+        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
+
+    /* the following fdt cells are masked off the pci status register */
+    pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
+    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
+                          PCI_STATUS_DEVSEL_MASK & pci_status));
+
+    if (pci_status & PCI_STATUS_FAST_BACK) {
+        _FDT(fdt_setprop(fdt, offset, "fast-back-to-back", NULL, 0));
+    }
+    if (pci_status & PCI_STATUS_66MHZ) {
+        _FDT(fdt_setprop(fdt, offset, "66mhz-capable", NULL, 0));
+    }
+    if (pci_status & PCI_STATUS_UDF) {
+        _FDT(fdt_setprop(fdt, offset, "udf-supported", NULL, 0));
+    }
+
+    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
+    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", drc_name, strlen(drc_name)));
+    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index));
+
+    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
+                          RESOURCE_CELLS_ADDRESS));
+    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
+                          RESOURCE_CELLS_SIZE));
+    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
+                          RESOURCE_CELLS_SIZE));
+
+    populate_resource_props(dev, &rp);
+    _FDT(fdt_setprop(fdt, offset, "reg", rp.reg.data, rp.reg_len));
+    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
+                     rp.assigned.data, rp.assigned_len));
+
+    return 0;
+}
+
+/* create OF node for pci device and required OF DT properties */
+static void *spapr_create_pci_child_dt(sPAPRPHBState *phb, PCIDevice *dev,
+                                       int drc_index, const char *drc_name,
+                                       int *dt_offset)
+{
+    void *fdt;
+    int offset, ret, fdt_size;
+    int slot = PCI_SLOT(dev->devfn);
+    char nodename[512];
+
+    fdt = create_device_tree(&fdt_size);
+    sprintf(nodename, "pci@%d", slot);
+    offset = fdt_add_subnode(fdt, 0, nodename);
+    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index, drc_index,
+                                      drc_name);
+    g_assert(!ret);
+
+    *dt_offset = offset;
+    return fdt;
+}
+
+static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
+                                     sPAPRPHBState *phb,
+                                     PCIDevice *pdev,
+                                     Error **errp)
+{
+    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+    DeviceState *dev = DEVICE(pdev);
+    int drc_index = drck->get_index(drc);
+    const char *drc_name = drck->get_name(drc);
+    void *fdt = NULL;
+    int fdt_start_offset = 0;
+
+    /* boot-time devices get their device tree node created by SLOF, but for
+     * hotplugged devices we need QEMU to generate it so the guest can fetch
+     * it via RTAS
+     */
+    if (dev->hotplugged) {
+        fdt = spapr_create_pci_child_dt(phb, pdev, drc_index, drc_name,
+                                        &fdt_start_offset);
+    }
+
+    drck->attach(drc, DEVICE(pdev),
+                 fdt, fdt_start_offset, !dev->hotplugged, errp);
+    if (errp) {
+        g_free(fdt);
+    }
+}
+
+static void spapr_phb_remove_pci_device_cb(DeviceState *dev, void *opaque)
+{
+    /* some version guests do not wait for completion of a device
+     * cleanup (generally done asynchronously by the kernel) before
+     * signaling to QEMU that the device is safe, but instead sleep
+     * for some 'safe' period of time. unfortunately on a busy host
+     * this sleep isn't guaranteed to be long enough, resulting in
+     * bad things like IRQ lines being left asserted during final
+     * device removal. to deal with this we call reset just prior
+     * to finalizing the device, which will put the device back into
+     * an 'idle' state, as the device cleanup code expects.
+     */
+    pci_device_reset(PCI_DEVICE(dev));
+    object_unparent(OBJECT(dev));
+}
+
+static void spapr_phb_remove_pci_device(sPAPRDRConnector *drc,
+                                        sPAPRPHBState *phb,
+                                        PCIDevice *pdev,
+                                        Error **errp)
+{
+    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+
+    drck->detach(drc, DEVICE(pdev), spapr_phb_remove_pci_device_cb, phb, errp);
+}
+
+static void spapr_phb_hot_plug_child(HotplugHandler *plug_handler,
+                                     DeviceState *plugged_dev, Error **errp)
+{
+    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(DEVICE(plug_handler));
+    PCIDevice *pdev = PCI_DEVICE(plugged_dev);
+    sPAPRDRConnector *drc = spapr_dr_connector_by_id(
+        SPAPR_DR_CONNECTOR_TYPE_PCI,
+        (phb->index << 16) |
+        (pci_bus_num(PCI_BUS(qdev_get_parent_bus(plugged_dev))) << 8) |
+        pdev->devfn);
+    Error *local_err = NULL;
+
+    /* if DR is disabled we don't need to do anything in the case of
+     * hotplug or coldplug callbacks
+     */
+    if (!phb->dr_enabled) {
+        /* if this is a hotplug operation initiated by the user
+         * we need to let them know it's not enabled
+         */
+        if (plugged_dev->hotplugged) {
+            error_set(errp, QERR_BUS_NO_HOTPLUG,
+                      object_get_typename(OBJECT(phb)));
+        }
+        return;
+    }
+
+    g_assert(drc);
+
+    spapr_phb_add_pci_device(drc, phb, pdev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
+
+static void spapr_phb_hot_unplug_child(HotplugHandler *plug_handler,
+                                       DeviceState *plugged_dev, Error **errp)
+{
+    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(DEVICE(plug_handler));
+    PCIDevice *pdev = PCI_DEVICE(plugged_dev);
+    sPAPRDRConnectorClass *drck;
+    sPAPRDRConnector *drc = spapr_dr_connector_by_id(
+        SPAPR_DR_CONNECTOR_TYPE_PCI,
+        (phb->index << 16) |
+        (pci_bus_num(PCI_BUS(qdev_get_parent_bus(plugged_dev))) << 8) |
+        pdev->devfn);
+    Error *local_err = NULL;
+
+    if (!phb->dr_enabled) {
+        error_set(errp, QERR_BUS_NO_HOTPLUG,
+                  object_get_typename(OBJECT(phb)));
+        return;
+    }
+
+    g_assert(drc);
+
+    drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+    if (!drck->release_pending(drc)) {
+        spapr_phb_remove_pci_device(drc, phb, pdev, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
 static void spapr_phb_realize(DeviceState *dev, Error **errp)
 {
     SysBusDevice *s = SYS_BUS_DEVICE(dev);
@@ -574,6 +936,7 @@  static void spapr_phb_realize(DeviceState *dev, Error **errp)
                            &sphb->memspace, &sphb->iospace,
                            PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
     phb->bus = bus;
+    qbus_set_hotplug_handler(BUS(phb->bus), DEVICE(sphb), NULL);
 
     /*
      * Initialize PHB address space.
@@ -810,6 +1173,7 @@  static void spapr_phb_class_init(ObjectClass *klass, void *data)
     PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
     DeviceClass *dc = DEVICE_CLASS(klass);
     sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
+    HotplugHandlerClass *hp = HOTPLUG_HANDLER_CLASS(klass);
 
     hc->root_bus_path = spapr_phb_root_bus_path;
     dc->realize = spapr_phb_realize;
@@ -819,6 +1183,8 @@  static void spapr_phb_class_init(ObjectClass *klass, void *data)
     set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
     dc->cannot_instantiate_with_device_add_yet = false;
     spc->finish_realize = spapr_phb_finish_realize;
+    hp->plug = spapr_phb_hot_plug_child;
+    hp->unplug = spapr_phb_hot_unplug_child;
 }
 
 static const TypeInfo spapr_phb_info = {
@@ -827,6 +1193,10 @@  static const TypeInfo spapr_phb_info = {
     .instance_size = sizeof(sPAPRPHBState),
     .class_init    = spapr_phb_class_init,
     .class_size    = sizeof(sPAPRPHBClass),
+    .interfaces    = (InterfaceInfo[]) {
+        { TYPE_HOTPLUG_HANDLER },
+        { }
+    }
 };
 
 PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
@@ -840,17 +1210,6 @@  PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
     return PCI_HOST_BRIDGE(dev);
 }
 
-/* Macros to operate with address in OF binding to PCI */
-#define b_x(x, p, l)    (((x) & ((1<<(l))-1)) << (p))
-#define b_n(x)          b_x((x), 31, 1) /* 0 if relocatable */
-#define b_p(x)          b_x((x), 30, 1) /* 1 if prefetchable */
-#define b_t(x)          b_x((x), 29, 1) /* 1 if the address is aliased */
-#define b_ss(x)         b_x((x), 24, 2) /* the space code */
-#define b_bbbbbbbb(x)   b_x((x), 16, 8) /* bus number */
-#define b_ddddd(x)      b_x((x), 11, 5) /* device number */
-#define b_fff(x)        b_x((x), 8, 3)  /* function number */
-#define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
-
 typedef struct sPAPRTCEDT {
     void *fdt;
     int node_off;
@@ -920,14 +1279,6 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
         return bus_off;
     }
 
-#define _FDT(exp) \
-    do { \
-        int ret = (exp);                                           \
-        if (ret < 0) {                                             \
-            return ret;                                            \
-        }                                                          \
-    } while (0)
-
     /* Write PHB properties */
     _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
     _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));