diff mbox

[v2,11/14] spapr_pci: enable basic hotplug operations

Message ID 1386282785-466-12-git-send-email-mdroth@linux.vnet.ibm.com
State New
Headers show

Commit Message

Michael Roth Dec. 5, 2013, 10:33 p.m. UTC
From: Mike Day <ncmike@ncultra.org>

This enables hotplug for PHB bridges. Upon hotplug we generate the
OF-nodes required by PAPR specification and IEEE 1275-1994
"PCI Bus Binding to Open Firmware" for the device.

We associate the corresponding FDT for these nodes with the DrcEntry
corresponding to the slot, which will be fetched via
ibm,configure-connector RTAS calls by the guest as described by PAPR
specification. The FDT is cleaned up in the case of unplug.

Amongst the required OF-node properties for each device are the "reg"
and "assigned-addresses" properties which describe the BAR-assignments
for IO/MEM/ROM regions. To handle these assignments we scan the address
space associated with each region for a contiguous range of appropriate
size based on PCI specification and encode these in accordance with
Open Firmware PCI Bus Binding spec.

These assignments will be used by the guest when the rpaphp hotplug
module is used, but may be re-assigned by guests for cases where we
rely on bus rescan.

Signed-off-by: Mike Day <ncmike@ncultra.org>
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 hw/ppc/spapr_pci.c     |  375 ++++++++++++++++++++++++++++++++++++++++++++++--
 include/hw/ppc/spapr.h |    1 +
 2 files changed, 368 insertions(+), 8 deletions(-)

Comments

Alexey Kardashevskiy Dec. 16, 2013, 4:36 a.m. UTC | #1
On 12/06/2013 09:33 AM, Michael Roth wrote:
> From: Mike Day <ncmike@ncultra.org>
> 
> This enables hotplug for PHB bridges. Upon hotplug we generate the
> OF-nodes required by PAPR specification and IEEE 1275-1994
> "PCI Bus Binding to Open Firmware" for the device.
> 
> We associate the corresponding FDT for these nodes with the DrcEntry
> corresponding to the slot, which will be fetched via
> ibm,configure-connector RTAS calls by the guest as described by PAPR
> specification. The FDT is cleaned up in the case of unplug.
> 
> Amongst the required OF-node properties for each device are the "reg"
> and "assigned-addresses" properties which describe the BAR-assignments
> for IO/MEM/ROM regions. To handle these assignments we scan the address
> space associated with each region for a contiguous range of appropriate
> size based on PCI specification and encode these in accordance with
> Open Firmware PCI Bus Binding spec.
> 
> These assignments will be used by the guest when the rpaphp hotplug
> module is used, but may be re-assigned by guests for cases where we
> rely on bus rescan.
> 
> Signed-off-by: Mike Day <ncmike@ncultra.org>
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr_pci.c     |  375 ++++++++++++++++++++++++++++++++++++++++++++++--
>  include/hw/ppc/spapr.h |    1 +
>  2 files changed, 368 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 6e7ee31..9b4f829 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -56,6 +56,17 @@
>  #define RTAS_TYPE_MSI           1
>  #define RTAS_TYPE_MSIX          2
>  
> +#define FDT_MAX_SIZE            0x10000
> +#define _FDT(exp) \
> +    do { \
> +        int ret = (exp);                                           \
> +        if (ret < 0) {                                             \
> +            return ret;                                            \
> +        }                                                          \
> +    } while (0)
> +
> +static void spapr_drc_state_reset(DrcEntry *drc_entry);
> +
>  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
>  {
>      sPAPRPHBState *sphb;
> @@ -448,6 +459,22 @@ static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>          /* encode the new value into the correct bit field */
>          shift = INDICATOR_ISOLATION_SHIFT;
>          mask = INDICATOR_ISOLATION_MASK;
> +        if (drc_entry) {
> +            /* transition from unisolated to isolated for a hotplug slot
> +             * entails completion of guest-side device unplug/cleanup, so
> +             * we can now safely remove the device if qemu is waiting for
> +             * it to be released
> +             */
> +            if (DECODE_DRC_STATE(*pind, mask, shift) != indicator_state) {
> +                if (indicator_state == 0 && drc_entry->awaiting_release) {
> +                    /* device_del has been called and host is waiting
> +                     * for guest to release/isolate device, go ahead
> +                     * and remove it now
> +                     */
> +                    spapr_drc_state_reset(drc_entry);
> +                }
> +            }
> +        }
>          break;
>      case 9002: /* DR */
>          shift = INDICATOR_DR_SHIFT;
> @@ -776,6 +803,345 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
>      return &phb->iommu_as;
>  }
>  
> +/* for 'reg'/'assigned-addresses' OF properties */
> +#define RESOURCE_CELLS_SIZE 2
> +#define RESOURCE_CELLS_ADDRESS 3
> +#define RESOURCE_CELLS_TOTAL \
> +    (RESOURCE_CELLS_SIZE + RESOURCE_CELLS_ADDRESS)
> +
> +static void fill_resource_props(PCIDevice *d, int bus_num,
> +                                uint32_t *reg, int *reg_size,
> +                                uint32_t *assigned, int *assigned_size)
> +{
> +    uint32_t *reg_row, *assigned_row;
> +    uint32_t dev_id = ((bus_num << 8) |
> +                        (PCI_SLOT(d->devfn) << 3) | PCI_FUNC(d->devfn));
> +    int i, idx = 0;
> +
> +    reg[0] = cpu_to_be32(dev_id << 8);
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        if (!d->io_regions[i].size) {
> +            continue;
> +        }
> +        reg_row = &reg[(idx + 1) * RESOURCE_CELLS_TOTAL];
> +        assigned_row = &assigned[idx * RESOURCE_CELLS_TOTAL];
> +        reg_row[0] = cpu_to_be32((dev_id << 8) | (pci_bar(d, i) & 0xff));
> +        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
> +            reg_row[0] |= cpu_to_be32(0x01000000);
> +        } else {
> +            reg_row[0] |= cpu_to_be32(0x02000000);
> +        }
> +        assigned_row[0] = cpu_to_be32(reg_row[0] | 0x80000000);


0x80000000 == relocatable? 0x01000000/0x02000000 - space codes? There are
macros (b_n, b_ss) in this file, can you please use them?


> +        assigned_row[3] = reg_row[3] = cpu_to_be32(d->io_regions[i].size >> 32);
> +        assigned_row[4] = reg_row[4] = cpu_to_be32(d->io_regions[i].size);
> +        assigned_row[1] = cpu_to_be32(d->io_regions[i].addr >> 32);
> +        assigned_row[2] = cpu_to_be32(d->io_regions[i].addr);
> +        idx++;
> +    }
> +
> +    *reg_size = (idx + 1) * RESOURCE_CELLS_TOTAL * sizeof(uint32_t);
> +    *assigned_size = idx * RESOURCE_CELLS_TOTAL * sizeof(uint32_t);
> +}
> +
> +static hwaddr spapr_find_bar_addr(sPAPRPHBState *phb, PCIIORegion *r)


This does not use @pbh at all and therefore can go to hw/pci/pci.c may be
(which can be tricky though)?


> +{
> +    MemoryRegionSection mrs = { 0 };
> +    hwaddr search_addr;
> +    hwaddr size = r->size;
> +    hwaddr addr_mask = ~(size - 1);
> +    hwaddr increment = size;
> +    hwaddr limit;
> +
> +    if (r->type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
> +        /* beginning portion of mmio address space for bus does not get
> +         * mapped into system memory, so calculate addr starting at the
> +         * corresponding offset into mmio as.
> +         */
> +        search_addr = (SPAPR_PCI_MEM_WIN_BUS_OFFSET + increment) & addr_mask;
> +    } else {
> +        search_addr = increment;
> +    }
> +    limit = memory_region_size(r->address_space);
> +
> +    do {
> +        mrs = memory_region_find_subregion(r->address_space, search_addr, size);
> +        if (mrs.mr) {
> +            hwaddr mr_last_addr;
> +            mr_last_addr = mrs.mr->addr + memory_region_size(mrs.mr) - 1;
> +            search_addr = (mr_last_addr + 1) & addr_mask;
> +            if (search_addr <= mr_last_addr) {
> +                search_addr += increment;
> +            }
> +            /* this memory region overlaps, unref and continue searching */
> +            memory_region_unref(mrs.mr);
> +        }
> +    } while (int128_nz(mrs.size) && search_addr + size <= limit);
> +
> +    if (search_addr + size >= limit) {
> +        return PCI_BAR_UNMAPPED;
> +    }
> +
> +    return search_addr;
> +}
> +
> +static int spapr_map_bars(sPAPRPHBState *phb, PCIDevice *dev)

This does not use @phb, well, it uses to call spapr_find_bar_addr() but
that function does not use it either.

Yet another candidate to get moved to hw/pci/pci.c? If you do so, you'll
get even more reviews :)


> +{
> +    PCIIORegion *r;
> +    int i, ret = -1;
> +
> +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> +        uint32_t bar_address = pci_bar(dev, i);
> +        uint32_t bar_value;
> +        uint16_t cmd_value = pci_default_read_config(dev, PCI_COMMAND, 2);
> +        hwaddr addr;
> +
> +        r = &dev->io_regions[i];
> +
> +        /* this region isn't registered */
> +        if (!r->size) {
> +            continue;
> +        }
> +
> +        /* find a hw addr we can map */
> +        addr = spapr_find_bar_addr(phb, r);
> +        if (addr == PCI_BAR_UNMAPPED) {
> +            /* we can't find a free range within address space for this BAR */
> +            fprintf(stderr,
> +                    "Unable to map BAR %d, no free range available\n", i);
> +            return -1;
> +        }
> +        /* we can probably map this region into memory if there is not
> +         * a race condition with some other allocator. write the address
> +         * to the device BAR which will force a call to pci_update_mappings
> +         */
> +        if (r->type & PCI_BASE_ADDRESS_SPACE_IO) {
> +            pci_default_write_config(dev, PCI_COMMAND,
> +                                     cmd_value | PCI_COMMAND_IO, 2);
> +        } else {
> +            pci_default_write_config(dev, PCI_COMMAND,
> +                                     cmd_value | PCI_COMMAND_MEMORY, 2);
> +        }
> +
> +        bar_value = addr;
> +
> +        if (i == PCI_ROM_SLOT) {
> +            bar_value |= PCI_ROM_ADDRESS_ENABLE;
> +        }
> +        /* write the new bar value */
> +        pci_default_write_config(dev, bar_address, bar_value, 4);
> +
> +        /* if this is a 64-bit BAR, we need to also write the
> +         * upper 32 bit value.
> +         */
> +        if (r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +            bar_value = (addr >> 32) & 0xffffffffUL;
> +            pci_default_write_config(dev, bar_address + 4, bar_value, 4);
> +        }
> +        ret = 0;
> +    }
> +    return ret;
> +}
> +
> +static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
> +                                       int phb_index)
> +{
> +    int slot = PCI_SLOT(dev->devfn);
> +    char slotname[16];
> +    bool is_bridge = 1;
> +    DrcEntry *drc_entry, *drc_entry_slot;
> +    uint32_t reg[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> +    uint32_t assigned[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> +    int reg_size, assigned_size;
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb_index + SPAPR_PCI_BASE_BUID);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +
> +    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==


s/1/PCI_HEADER_TYPE_BRIDGE/


> +        PCI_HEADER_TYPE_NORMAL) {
> +        is_bridge = 0;
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
> +                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
> +                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
> +                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
> +                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) << 8));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
> +                          pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
> +
> +    /* if this device is NOT a bridge */
> +    if (!is_bridge) {


s/!is_bridge/pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
PCI_HEADER_TYPE_NORMAL/

and get rid of is_bridge?



> +        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
> +            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
> +            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
> +            pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
> +            pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
> +        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
> +
> +    /* the following fdt cells are masked off the pci status register */
> +    int pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
> +    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
> +                          PCI_STATUS_DEVSEL_MASK & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "fast-back-to-back",
> +                          PCI_STATUS_FAST_BACK & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "66mhz-capable",
> +                          PCI_STATUS_66MHZ & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "udf-supported",
> +                          PCI_STATUS_UDF & pci_status));
> +
> +    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
> +    sprintf(slotname, "Slot %d", slot + phb_index * 32);
> +    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", slotname, strlen(slotname)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index",
> +                          drc_entry_slot->drc_index));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
> +                          RESOURCE_CELLS_ADDRESS));
> +    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
> +                          RESOURCE_CELLS_SIZE));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
> +                          RESOURCE_CELLS_SIZE));
> +    fill_resource_props(dev, phb_index, reg, &reg_size,
> +                        assigned, &assigned_size);
> +    _FDT(fdt_setprop(fdt, offset, "reg", reg, reg_size));
> +    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
> +                     assigned, assigned_size));
> +
> +    return 0;
> +}
> +
> +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> +    DrcEntry *drc_entry, *drc_entry_slot;
> +    ConfigureConnectorState *ccs;
> +    int slot = PCI_SLOT(dev->devfn);
> +    int offset, ret;
> +    void *fdt_orig, *fdt;
> +    char nodename[512];
> +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
> +                                        INDICATOR_ENTITY_SENSE_MASK,
> +                                        INDICATOR_ENTITY_SENSE_SHIFT);
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb->buid);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +
> +    drc_entry->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry->state |= encoded; /* DR entity present */
> +    drc_entry_slot->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry_slot->state |= encoded; /* and the slot */


"and the slot" what?
s/uint32_t encoded/const uint32_t present/ and remove the comments?


> +    /* need to allocate memory region for device BARs */
> +    spapr_map_bars(phb, dev);
> +
> +    /* add OF node for pci device and required OF DT properties */
> +    fdt_orig = g_malloc0(FDT_MAX_SIZE);
> +    offset = fdt_create(fdt_orig, FDT_MAX_SIZE);
> +    fdt_begin_node(fdt_orig, "");
> +    fdt_end_node(fdt_orig);
> +    fdt_finish(fdt_orig);
> +
> +    fdt = g_malloc0(FDT_MAX_SIZE);
> +    fdt_open_into(fdt_orig, fdt, FDT_MAX_SIZE);
> +    sprintf(nodename, "pci@%d", slot);
> +    offset = fdt_add_subnode(fdt, 0, nodename);
> +    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index);
> +    g_assert(!ret);
> +    g_free(fdt_orig);
> +
> +    /* hold on to node, configure_connector will pass it to the guest later */
> +    ccs = &drc_entry_slot->cc_state;
> +    ccs->fdt = fdt;
> +    ccs->offset_start = offset;
> +    ccs->state = CC_STATE_PENDING;
> +    ccs->dev = dev;
> +
> +    return 0;
> +}
> +
> +/* check whether guest has released/isolated device */
> +static bool spapr_drc_state_is_releasable(DrcEntry *drc_entry)
> +{
> +    return !DECODE_DRC_STATE(drc_entry->state,
> +                             INDICATOR_ISOLATION_MASK,
> +                             INDICATOR_ISOLATION_SHIFT);
> +}

It looks like this is the only separated function which calls
DECODE_DRC_STATE, and it is used just once, and  others call
DECODE_DRC_STATE()/ENCODE_DRC_STATE() directly. I'd remove this function
and call DECODE_DRC_STATE() directly, below in the code.


> +
> +/* finalize device unplug/deletion */
> +static void spapr_drc_state_reset(DrcEntry *drc_entry)
> +{
> +    ConfigureConnectorState *ccs = &drc_entry->cc_state;
> +    uint32_t sense_empty = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_EMPTY,
> +                                            INDICATOR_ENTITY_SENSE_MASK,
> +                                            INDICATOR_ENTITY_SENSE_SHIFT);
> +
> +    g_free(ccs->fdt);
> +    ccs->fdt = NULL;
> +    object_unparent(OBJECT(ccs->dev));
> +    ccs->dev = NULL;
> +    ccs->state = CC_STATE_IDLE;
> +    drc_entry->state &= ~INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry->state |= sense_empty;
> +    drc_entry->awaiting_release = false;
> +}
> +
> +static void spapr_device_hotplug_remove(DeviceState *qdev, PCIDevice *dev)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> +    DrcEntry *drc_entry, *drc_entry_slot;
> +    ConfigureConnectorState *ccs;
> +    int slot = PCI_SLOT(dev->devfn);
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb->buid);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +    ccs = &drc_entry_slot->cc_state;
> +    /* shouldn't be removing devices we haven't created an fdt for */
> +    g_assert(ccs->state != CC_STATE_IDLE);


Instead of g_assert(), would not it be better to return -1 here and then
return this return code from spapr_device_hotplug() and let common PCI code
handle this?

Or we are absolutely sure that spapr_device_hotplug() cannot possibly fail
so we are ready to kill the guest if it does? I do not know, just asking :)


> +    /* if the device has already been released/isolated by guest, go ahead
> +     * and remove it now. Otherwise, flag it as pending guest release so it
> +     * can be removed later
> +     */
> +    if (spapr_drc_state_is_releasable(drc_entry_slot)) {
> +        spapr_drc_state_reset(drc_entry_slot);
> +    } else {
> +        if (drc_entry_slot->awaiting_release) {
> +            fprintf(stderr, "waiting for guest to release the device");
> +        } else {
> +            drc_entry_slot->awaiting_release = true;
> +        }
> +    }
> +}
> +
> +static int spapr_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> +                                PCIHotplugState state)
> +{

sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);

> +    if (state == PCI_COLDPLUG_ENABLED) {
> +        return 0;
> +    }
> +
> +    if (state == PCI_HOTPLUG_ENABLED) {
> +        spapr_device_hotplug_add(qdev, dev);
> +    } else {
> +        spapr_device_hotplug_remove(qdev, dev);
> +    }

and here s/qdev/phb/? spapr_device_hotplug_(add|remove),
spapr_pci_hotplug_(add|remove)_event (from further patch(es)) do not use
qdev as a DeviceState anyway, they cast it to sPAPRPHBState and use that.



> +
> +    return 0;
> +}
> +
>  static int spapr_phb_init(SysBusDevice *s)
>  {
>      DeviceState *dev = DEVICE(s);
> @@ -889,6 +1255,7 @@ static int spapr_phb_init(SysBusDevice *s)
>                             &sphb->memspace, &sphb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
>      phb->bus = bus;
> +    pci_bus_hotplug(phb->bus, spapr_device_hotplug, DEVICE(sphb));
>  
>      sphb->dma_window_start = 0;
>      sphb->dma_window_size = 0x40000000;
> @@ -1181,14 +1548,6 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>          return bus_off;
>      }
>  
> -#define _FDT(exp) \
> -    do { \
> -        int ret = (exp);                                           \
> -        if (ret < 0) {                                             \
> -            return ret;                                            \
> -        }                                                          \
> -    } while (0)
> -
>      /* Write PHB properties */
>      _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
>      _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 7c8a521..1c9b725 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -328,6 +328,7 @@ struct DrcEntry {
>      void *fdt;
>      int fdt_offset;
>      uint32_t state;
> +    bool awaiting_release;
>      ConfigureConnectorState cc_state;
>      DrcEntry *child_entries;
>  };
>
Michael Roth Jan. 16, 2014, 9:22 p.m. UTC | #2
Quoting Alexey Kardashevskiy (2013-12-15 22:36:32)
> On 12/06/2013 09:33 AM, Michael Roth wrote:
> > From: Mike Day <ncmike@ncultra.org>
> > 
> > This enables hotplug for PHB bridges. Upon hotplug we generate the
> > OF-nodes required by PAPR specification and IEEE 1275-1994
> > "PCI Bus Binding to Open Firmware" for the device.
> > 
> > We associate the corresponding FDT for these nodes with the DrcEntry
> > corresponding to the slot, which will be fetched via
> > ibm,configure-connector RTAS calls by the guest as described by PAPR
> > specification. The FDT is cleaned up in the case of unplug.
> > 
> > Amongst the required OF-node properties for each device are the "reg"
> > and "assigned-addresses" properties which describe the BAR-assignments
> > for IO/MEM/ROM regions. To handle these assignments we scan the address
> > space associated with each region for a contiguous range of appropriate
> > size based on PCI specification and encode these in accordance with
> > Open Firmware PCI Bus Binding spec.
> > 
> > These assignments will be used by the guest when the rpaphp hotplug
> > module is used, but may be re-assigned by guests for cases where we
> > rely on bus rescan.
> > 
> > Signed-off-by: Mike Day <ncmike@ncultra.org>
> > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > ---
> >  hw/ppc/spapr_pci.c     |  375 ++++++++++++++++++++++++++++++++++++++++++++++--
> >  include/hw/ppc/spapr.h |    1 +
> >  2 files changed, 368 insertions(+), 8 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > index 6e7ee31..9b4f829 100644
> > --- a/hw/ppc/spapr_pci.c
> > +++ b/hw/ppc/spapr_pci.c
> > @@ -56,6 +56,17 @@
> >  #define RTAS_TYPE_MSI           1
> >  #define RTAS_TYPE_MSIX          2
> >  
> > +#define FDT_MAX_SIZE            0x10000
> > +#define _FDT(exp) \
> > +    do { \
> > +        int ret = (exp);                                           \
> > +        if (ret < 0) {                                             \
> > +            return ret;                                            \
> > +        }                                                          \
> > +    } while (0)
> > +
> > +static void spapr_drc_state_reset(DrcEntry *drc_entry);
> > +
> >  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
> >  {
> >      sPAPRPHBState *sphb;
> > @@ -448,6 +459,22 @@ static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> >          /* encode the new value into the correct bit field */
> >          shift = INDICATOR_ISOLATION_SHIFT;
> >          mask = INDICATOR_ISOLATION_MASK;
> > +        if (drc_entry) {
> > +            /* transition from unisolated to isolated for a hotplug slot
> > +             * entails completion of guest-side device unplug/cleanup, so
> > +             * we can now safely remove the device if qemu is waiting for
> > +             * it to be released
> > +             */
> > +            if (DECODE_DRC_STATE(*pind, mask, shift) != indicator_state) {
> > +                if (indicator_state == 0 && drc_entry->awaiting_release) {
> > +                    /* device_del has been called and host is waiting
> > +                     * for guest to release/isolate device, go ahead
> > +                     * and remove it now
> > +                     */
> > +                    spapr_drc_state_reset(drc_entry);
> > +                }
> > +            }
> > +        }
> >          break;
> >      case 9002: /* DR */
> >          shift = INDICATOR_DR_SHIFT;
> > @@ -776,6 +803,345 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
> >      return &phb->iommu_as;
> >  }
> >  
> > +/* for 'reg'/'assigned-addresses' OF properties */
> > +#define RESOURCE_CELLS_SIZE 2
> > +#define RESOURCE_CELLS_ADDRESS 3
> > +#define RESOURCE_CELLS_TOTAL \
> > +    (RESOURCE_CELLS_SIZE + RESOURCE_CELLS_ADDRESS)
> > +
> > +static void fill_resource_props(PCIDevice *d, int bus_num,
> > +                                uint32_t *reg, int *reg_size,
> > +                                uint32_t *assigned, int *assigned_size)
> > +{
> > +    uint32_t *reg_row, *assigned_row;
> > +    uint32_t dev_id = ((bus_num << 8) |
> > +                        (PCI_SLOT(d->devfn) << 3) | PCI_FUNC(d->devfn));
> > +    int i, idx = 0;
> > +
> > +    reg[0] = cpu_to_be32(dev_id << 8);
> > +
> > +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> > +        if (!d->io_regions[i].size) {
> > +            continue;
> > +        }
> > +        reg_row = &reg[(idx + 1) * RESOURCE_CELLS_TOTAL];
> > +        assigned_row = &assigned[idx * RESOURCE_CELLS_TOTAL];
> > +        reg_row[0] = cpu_to_be32((dev_id << 8) | (pci_bar(d, i) & 0xff));
> > +        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
> > +            reg_row[0] |= cpu_to_be32(0x01000000);
> > +        } else {
> > +            reg_row[0] |= cpu_to_be32(0x02000000);
> > +        }
> > +        assigned_row[0] = cpu_to_be32(reg_row[0] | 0x80000000);
> 
> 
> 0x80000000 == relocatable? 0x01000000/0x02000000 - space codes? There are
> macros (b_n, b_ss) in this file, can you please use them?

Ah, those look handy, thanks

> 
> 
> > +        assigned_row[3] = reg_row[3] = cpu_to_be32(d->io_regions[i].size >> 32);
> > +        assigned_row[4] = reg_row[4] = cpu_to_be32(d->io_regions[i].size);
> > +        assigned_row[1] = cpu_to_be32(d->io_regions[i].addr >> 32);
> > +        assigned_row[2] = cpu_to_be32(d->io_regions[i].addr);
> > +        idx++;
> > +    }
> > +
> > +    *reg_size = (idx + 1) * RESOURCE_CELLS_TOTAL * sizeof(uint32_t);
> > +    *assigned_size = idx * RESOURCE_CELLS_TOTAL * sizeof(uint32_t);
> > +}
> > +
> > +static hwaddr spapr_find_bar_addr(sPAPRPHBState *phb, PCIIORegion *r)
> 
> 
> This does not use @pbh at all and therefore can go to hw/pci/pci.c may be
> (which can be tricky though)?

SPAPR_PCI_MEM_WIN_BUS_OFFSET is certainly platform specific, so that would
need to be generalized. Beyond that, I don't have enough experience to know
how useful this would be for other platforms, but perhaps it's general enough
for others to add-to/re-use in the future. I'll take a stab at moving it to
hw/pci/pci.c and Cc the appropriate maintainers.

> 
> 
> > +{
> > +    MemoryRegionSection mrs = { 0 };
> > +    hwaddr search_addr;
> > +    hwaddr size = r->size;
> > +    hwaddr addr_mask = ~(size - 1);
> > +    hwaddr increment = size;
> > +    hwaddr limit;
> > +
> > +    if (r->type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
> > +        /* beginning portion of mmio address space for bus does not get
> > +         * mapped into system memory, so calculate addr starting at the
> > +         * corresponding offset into mmio as.
> > +         */
> > +        search_addr = (SPAPR_PCI_MEM_WIN_BUS_OFFSET + increment) & addr_mask;
> > +    } else {
> > +        search_addr = increment;
> > +    }
> > +    limit = memory_region_size(r->address_space);
> > +
> > +    do {
> > +        mrs = memory_region_find_subregion(r->address_space, search_addr, size);
> > +        if (mrs.mr) {
> > +            hwaddr mr_last_addr;
> > +            mr_last_addr = mrs.mr->addr + memory_region_size(mrs.mr) - 1;
> > +            search_addr = (mr_last_addr + 1) & addr_mask;
> > +            if (search_addr <= mr_last_addr) {
> > +                search_addr += increment;
> > +            }
> > +            /* this memory region overlaps, unref and continue searching */
> > +            memory_region_unref(mrs.mr);
> > +        }
> > +    } while (int128_nz(mrs.size) && search_addr + size <= limit);
> > +
> > +    if (search_addr + size >= limit) {
> > +        return PCI_BAR_UNMAPPED;
> > +    }
> > +
> > +    return search_addr;
> > +}
> > +
> > +static int spapr_map_bars(sPAPRPHBState *phb, PCIDevice *dev)
> 
> This does not use @phb, well, it uses to call spapr_find_bar_addr() but
> that function does not use it either.
> 
> Yet another candidate to get moved to hw/pci/pci.c? If you do so, you'll
> get even more reviews :)
> 
> 
> > +{
> > +    PCIIORegion *r;
> > +    int i, ret = -1;
> > +
> > +    for (i = 0; i < PCI_NUM_REGIONS; i++) {
> > +        uint32_t bar_address = pci_bar(dev, i);
> > +        uint32_t bar_value;
> > +        uint16_t cmd_value = pci_default_read_config(dev, PCI_COMMAND, 2);
> > +        hwaddr addr;
> > +
> > +        r = &dev->io_regions[i];
> > +
> > +        /* this region isn't registered */
> > +        if (!r->size) {
> > +            continue;
> > +        }
> > +
> > +        /* find a hw addr we can map */
> > +        addr = spapr_find_bar_addr(phb, r);
> > +        if (addr == PCI_BAR_UNMAPPED) {
> > +            /* we can't find a free range within address space for this BAR */
> > +            fprintf(stderr,
> > +                    "Unable to map BAR %d, no free range available\n", i);
> > +            return -1;
> > +        }
> > +        /* we can probably map this region into memory if there is not
> > +         * a race condition with some other allocator. write the address
> > +         * to the device BAR which will force a call to pci_update_mappings
> > +         */
> > +        if (r->type & PCI_BASE_ADDRESS_SPACE_IO) {
> > +            pci_default_write_config(dev, PCI_COMMAND,
> > +                                     cmd_value | PCI_COMMAND_IO, 2);
> > +        } else {
> > +            pci_default_write_config(dev, PCI_COMMAND,
> > +                                     cmd_value | PCI_COMMAND_MEMORY, 2);
> > +        }
> > +
> > +        bar_value = addr;
> > +
> > +        if (i == PCI_ROM_SLOT) {
> > +            bar_value |= PCI_ROM_ADDRESS_ENABLE;
> > +        }
> > +        /* write the new bar value */
> > +        pci_default_write_config(dev, bar_address, bar_value, 4);
> > +
> > +        /* if this is a 64-bit BAR, we need to also write the
> > +         * upper 32 bit value.
> > +         */
> > +        if (r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +            bar_value = (addr >> 32) & 0xffffffffUL;
> > +            pci_default_write_config(dev, bar_address + 4, bar_value, 4);
> > +        }
> > +        ret = 0;
> > +    }
> > +    return ret;
> > +}
> > +
> > +static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
> > +                                       int phb_index)
> > +{
> > +    int slot = PCI_SLOT(dev->devfn);
> > +    char slotname[16];
> > +    bool is_bridge = 1;
> > +    DrcEntry *drc_entry, *drc_entry_slot;
> > +    uint32_t reg[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> > +    uint32_t assigned[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> > +    int reg_size, assigned_size;
> > +
> > +    drc_entry = spapr_phb_to_drc_entry(phb_index + SPAPR_PCI_BASE_BUID);
> > +    g_assert(drc_entry);
> > +    drc_entry_slot = &drc_entry->child_entries[slot];
> > +
> > +    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
> 
> 
> s/1/PCI_HEADER_TYPE_BRIDGE/
> 
> 
> > +        PCI_HEADER_TYPE_NORMAL) {
> > +        is_bridge = 0;
> > +    }
> > +
> > +    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
> > +                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
> > +                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
> > +                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
> > +                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) << 8));
> > +
> > +    _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
> > +                          pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
> > +
> > +    /* if this device is NOT a bridge */
> > +    if (!is_bridge) {
> 
> 
> s/!is_bridge/pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
> PCI_HEADER_TYPE_NORMAL/
> 
> and get rid of is_bridge?
> 
> 
> 
> > +        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
> > +            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
> > +        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
> > +            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
> > +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
> > +            pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
> > +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
> > +            pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
> > +    }
> > +
> > +    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
> > +        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
> > +
> > +    /* the following fdt cells are masked off the pci status register */
> > +    int pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
> > +    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
> > +                          PCI_STATUS_DEVSEL_MASK & pci_status));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "fast-back-to-back",
> > +                          PCI_STATUS_FAST_BACK & pci_status));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "66mhz-capable",
> > +                          PCI_STATUS_66MHZ & pci_status));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "udf-supported",
> > +                          PCI_STATUS_UDF & pci_status));
> > +
> > +    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
> > +    sprintf(slotname, "Slot %d", slot + phb_index * 32);
> > +    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", slotname, strlen(slotname)));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index",
> > +                          drc_entry_slot->drc_index));
> > +
> > +    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
> > +                          RESOURCE_CELLS_ADDRESS));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
> > +                          RESOURCE_CELLS_SIZE));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
> > +                          RESOURCE_CELLS_SIZE));
> > +    fill_resource_props(dev, phb_index, reg, &reg_size,
> > +                        assigned, &assigned_size);
> > +    _FDT(fdt_setprop(fdt, offset, "reg", reg, reg_size));
> > +    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
> > +                     assigned, assigned_size));
> > +
> > +    return 0;
> > +}
> > +
> > +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
> > +{
> > +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> > +    DrcEntry *drc_entry, *drc_entry_slot;
> > +    ConfigureConnectorState *ccs;
> > +    int slot = PCI_SLOT(dev->devfn);
> > +    int offset, ret;
> > +    void *fdt_orig, *fdt;
> > +    char nodename[512];
> > +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
> > +                                        INDICATOR_ENTITY_SENSE_MASK,
> > +                                        INDICATOR_ENTITY_SENSE_SHIFT);
> > +
> > +    drc_entry = spapr_phb_to_drc_entry(phb->buid);
> > +    g_assert(drc_entry);
> > +    drc_entry_slot = &drc_entry->child_entries[slot];
> > +
> > +    drc_entry->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
> > +    drc_entry->state |= encoded; /* DR entity present */
> > +    drc_entry_slot->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
> > +    drc_entry_slot->state |= encoded; /* and the slot */
> 
> 
> "and the slot" what?
> s/uint32_t encoded/const uint32_t present/ and remove the comments?
> 
> 
> > +    /* need to allocate memory region for device BARs */
> > +    spapr_map_bars(phb, dev);
> > +
> > +    /* add OF node for pci device and required OF DT properties */
> > +    fdt_orig = g_malloc0(FDT_MAX_SIZE);
> > +    offset = fdt_create(fdt_orig, FDT_MAX_SIZE);
> > +    fdt_begin_node(fdt_orig, "");
> > +    fdt_end_node(fdt_orig);
> > +    fdt_finish(fdt_orig);
> > +
> > +    fdt = g_malloc0(FDT_MAX_SIZE);
> > +    fdt_open_into(fdt_orig, fdt, FDT_MAX_SIZE);
> > +    sprintf(nodename, "pci@%d", slot);
> > +    offset = fdt_add_subnode(fdt, 0, nodename);
> > +    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index);
> > +    g_assert(!ret);
> > +    g_free(fdt_orig);
> > +
> > +    /* hold on to node, configure_connector will pass it to the guest later */
> > +    ccs = &drc_entry_slot->cc_state;
> > +    ccs->fdt = fdt;
> > +    ccs->offset_start = offset;
> > +    ccs->state = CC_STATE_PENDING;
> > +    ccs->dev = dev;
> > +
> > +    return 0;
> > +}
> > +
> > +/* check whether guest has released/isolated device */
> > +static bool spapr_drc_state_is_releasable(DrcEntry *drc_entry)
> > +{
> > +    return !DECODE_DRC_STATE(drc_entry->state,
> > +                             INDICATOR_ISOLATION_MASK,
> > +                             INDICATOR_ISOLATION_SHIFT);
> > +}
> 
> It looks like this is the only separated function which calls
> DECODE_DRC_STATE, and it is used just once, and  others call
> DECODE_DRC_STATE()/ENCODE_DRC_STATE() directly. I'd remove this function
> and call DECODE_DRC_STATE() directly, below in the code.
> 
> 
> > +
> > +/* finalize device unplug/deletion */
> > +static void spapr_drc_state_reset(DrcEntry *drc_entry)
> > +{
> > +    ConfigureConnectorState *ccs = &drc_entry->cc_state;
> > +    uint32_t sense_empty = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_EMPTY,
> > +                                            INDICATOR_ENTITY_SENSE_MASK,
> > +                                            INDICATOR_ENTITY_SENSE_SHIFT);
> > +
> > +    g_free(ccs->fdt);
> > +    ccs->fdt = NULL;
> > +    object_unparent(OBJECT(ccs->dev));
> > +    ccs->dev = NULL;
> > +    ccs->state = CC_STATE_IDLE;
> > +    drc_entry->state &= ~INDICATOR_ENTITY_SENSE_MASK;
> > +    drc_entry->state |= sense_empty;
> > +    drc_entry->awaiting_release = false;
> > +}
> > +
> > +static void spapr_device_hotplug_remove(DeviceState *qdev, PCIDevice *dev)
> > +{
> > +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> > +    DrcEntry *drc_entry, *drc_entry_slot;
> > +    ConfigureConnectorState *ccs;
> > +    int slot = PCI_SLOT(dev->devfn);
> > +
> > +    drc_entry = spapr_phb_to_drc_entry(phb->buid);
> > +    g_assert(drc_entry);
> > +    drc_entry_slot = &drc_entry->child_entries[slot];
> > +    ccs = &drc_entry_slot->cc_state;
> > +    /* shouldn't be removing devices we haven't created an fdt for */
> > +    g_assert(ccs->state != CC_STATE_IDLE);
> 
> 
> Instead of g_assert(), would not it be better to return -1 here and then
> return this return code from spapr_device_hotplug() and let common PCI code
> handle this?
> 
> Or we are absolutely sure that spapr_device_hotplug() cannot possibly fail
> so we are ready to kill the guest if it does? I do not know, just asking :)

CC_STATE_IDLE describes an empty slot in the context of PCI, so we should
never hit the assertion. It more of a development aid to ensure proper code
handling during initial device creation. This should not be triggerable by
users.

> 
> 
> > +    /* if the device has already been released/isolated by guest, go ahead
> > +     * and remove it now. Otherwise, flag it as pending guest release so it
> > +     * can be removed later
> > +     */
> > +    if (spapr_drc_state_is_releasable(drc_entry_slot)) {
> > +        spapr_drc_state_reset(drc_entry_slot);
> > +    } else {
> > +        if (drc_entry_slot->awaiting_release) {
> > +            fprintf(stderr, "waiting for guest to release the device");
> > +        } else {
> > +            drc_entry_slot->awaiting_release = true;
> > +        }
> > +    }
> > +}
> > +
> > +static int spapr_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> > +                                PCIHotplugState state)
> > +{
> 
> sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> 
> > +    if (state == PCI_COLDPLUG_ENABLED) {
> > +        return 0;
> > +    }
> > +
> > +    if (state == PCI_HOTPLUG_ENABLED) {
> > +        spapr_device_hotplug_add(qdev, dev);
> > +    } else {
> > +        spapr_device_hotplug_remove(qdev, dev);
> > +    }
> 
> and here s/qdev/phb/? spapr_device_hotplug_(add|remove),
> spapr_pci_hotplug_(add|remove)_event (from further patch(es)) do not use
> qdev as a DeviceState anyway, they cast it to sPAPRPHBState and use that.
> 

That's cleaner, will fix.

> 
> 
> > +
> > +    return 0;
> > +}
> > +
> >  static int spapr_phb_init(SysBusDevice *s)
> >  {
> >      DeviceState *dev = DEVICE(s);
> > @@ -889,6 +1255,7 @@ static int spapr_phb_init(SysBusDevice *s)
> >                             &sphb->memspace, &sphb->iospace,
> >                             PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
> >      phb->bus = bus;
> > +    pci_bus_hotplug(phb->bus, spapr_device_hotplug, DEVICE(sphb));
> >  
> >      sphb->dma_window_start = 0;
> >      sphb->dma_window_size = 0x40000000;
> > @@ -1181,14 +1548,6 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >          return bus_off;
> >      }
> >  
> > -#define _FDT(exp) \
> > -    do { \
> > -        int ret = (exp);                                           \
> > -        if (ret < 0) {                                             \
> > -            return ret;                                            \
> > -        }                                                          \
> > -    } while (0)
> > -
> >      /* Write PHB properties */
> >      _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
> >      _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
> > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > index 7c8a521..1c9b725 100644
> > --- a/include/hw/ppc/spapr.h
> > +++ b/include/hw/ppc/spapr.h
> > @@ -328,6 +328,7 @@ struct DrcEntry {
> >      void *fdt;
> >      int fdt_offset;
> >      uint32_t state;
> > +    bool awaiting_release;
> >      ConfigureConnectorState cc_state;
> >      DrcEntry *child_entries;
> >  };
> > 
> 
> 
> -- 
> Alexey
diff mbox

Patch

diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 6e7ee31..9b4f829 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -56,6 +56,17 @@ 
 #define RTAS_TYPE_MSI           1
 #define RTAS_TYPE_MSIX          2
 
+#define FDT_MAX_SIZE            0x10000
+#define _FDT(exp) \
+    do { \
+        int ret = (exp);                                           \
+        if (ret < 0) {                                             \
+            return ret;                                            \
+        }                                                          \
+    } while (0)
+
+static void spapr_drc_state_reset(DrcEntry *drc_entry);
+
 static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
 {
     sPAPRPHBState *sphb;
@@ -448,6 +459,22 @@  static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr,
         /* encode the new value into the correct bit field */
         shift = INDICATOR_ISOLATION_SHIFT;
         mask = INDICATOR_ISOLATION_MASK;
+        if (drc_entry) {
+            /* transition from unisolated to isolated for a hotplug slot
+             * entails completion of guest-side device unplug/cleanup, so
+             * we can now safely remove the device if qemu is waiting for
+             * it to be released
+             */
+            if (DECODE_DRC_STATE(*pind, mask, shift) != indicator_state) {
+                if (indicator_state == 0 && drc_entry->awaiting_release) {
+                    /* device_del has been called and host is waiting
+                     * for guest to release/isolate device, go ahead
+                     * and remove it now
+                     */
+                    spapr_drc_state_reset(drc_entry);
+                }
+            }
+        }
         break;
     case 9002: /* DR */
         shift = INDICATOR_DR_SHIFT;
@@ -776,6 +803,345 @@  static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
     return &phb->iommu_as;
 }
 
+/* for 'reg'/'assigned-addresses' OF properties */
+#define RESOURCE_CELLS_SIZE 2
+#define RESOURCE_CELLS_ADDRESS 3
+#define RESOURCE_CELLS_TOTAL \
+    (RESOURCE_CELLS_SIZE + RESOURCE_CELLS_ADDRESS)
+
+static void fill_resource_props(PCIDevice *d, int bus_num,
+                                uint32_t *reg, int *reg_size,
+                                uint32_t *assigned, int *assigned_size)
+{
+    uint32_t *reg_row, *assigned_row;
+    uint32_t dev_id = ((bus_num << 8) |
+                        (PCI_SLOT(d->devfn) << 3) | PCI_FUNC(d->devfn));
+    int i, idx = 0;
+
+    reg[0] = cpu_to_be32(dev_id << 8);
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        if (!d->io_regions[i].size) {
+            continue;
+        }
+        reg_row = &reg[(idx + 1) * RESOURCE_CELLS_TOTAL];
+        assigned_row = &assigned[idx * RESOURCE_CELLS_TOTAL];
+        reg_row[0] = cpu_to_be32((dev_id << 8) | (pci_bar(d, i) & 0xff));
+        if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) {
+            reg_row[0] |= cpu_to_be32(0x01000000);
+        } else {
+            reg_row[0] |= cpu_to_be32(0x02000000);
+        }
+        assigned_row[0] = cpu_to_be32(reg_row[0] | 0x80000000);
+        assigned_row[3] = reg_row[3] = cpu_to_be32(d->io_regions[i].size >> 32);
+        assigned_row[4] = reg_row[4] = cpu_to_be32(d->io_regions[i].size);
+        assigned_row[1] = cpu_to_be32(d->io_regions[i].addr >> 32);
+        assigned_row[2] = cpu_to_be32(d->io_regions[i].addr);
+        idx++;
+    }
+
+    *reg_size = (idx + 1) * RESOURCE_CELLS_TOTAL * sizeof(uint32_t);
+    *assigned_size = idx * RESOURCE_CELLS_TOTAL * sizeof(uint32_t);
+}
+
+static hwaddr spapr_find_bar_addr(sPAPRPHBState *phb, PCIIORegion *r)
+{
+    MemoryRegionSection mrs = { 0 };
+    hwaddr search_addr;
+    hwaddr size = r->size;
+    hwaddr addr_mask = ~(size - 1);
+    hwaddr increment = size;
+    hwaddr limit;
+
+    if (r->type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+        /* beginning portion of mmio address space for bus does not get
+         * mapped into system memory, so calculate addr starting at the
+         * corresponding offset into mmio as.
+         */
+        search_addr = (SPAPR_PCI_MEM_WIN_BUS_OFFSET + increment) & addr_mask;
+    } else {
+        search_addr = increment;
+    }
+    limit = memory_region_size(r->address_space);
+
+    do {
+        mrs = memory_region_find_subregion(r->address_space, search_addr, size);
+        if (mrs.mr) {
+            hwaddr mr_last_addr;
+            mr_last_addr = mrs.mr->addr + memory_region_size(mrs.mr) - 1;
+            search_addr = (mr_last_addr + 1) & addr_mask;
+            if (search_addr <= mr_last_addr) {
+                search_addr += increment;
+            }
+            /* this memory region overlaps, unref and continue searching */
+            memory_region_unref(mrs.mr);
+        }
+    } while (int128_nz(mrs.size) && search_addr + size <= limit);
+
+    if (search_addr + size >= limit) {
+        return PCI_BAR_UNMAPPED;
+    }
+
+    return search_addr;
+}
+
+static int spapr_map_bars(sPAPRPHBState *phb, PCIDevice *dev)
+{
+    PCIIORegion *r;
+    int i, ret = -1;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        uint32_t bar_address = pci_bar(dev, i);
+        uint32_t bar_value;
+        uint16_t cmd_value = pci_default_read_config(dev, PCI_COMMAND, 2);
+        hwaddr addr;
+
+        r = &dev->io_regions[i];
+
+        /* this region isn't registered */
+        if (!r->size) {
+            continue;
+        }
+
+        /* find a hw addr we can map */
+        addr = spapr_find_bar_addr(phb, r);
+        if (addr == PCI_BAR_UNMAPPED) {
+            /* we can't find a free range within address space for this BAR */
+            fprintf(stderr,
+                    "Unable to map BAR %d, no free range available\n", i);
+            return -1;
+        }
+        /* we can probably map this region into memory if there is not
+         * a race condition with some other allocator. write the address
+         * to the device BAR which will force a call to pci_update_mappings
+         */
+        if (r->type & PCI_BASE_ADDRESS_SPACE_IO) {
+            pci_default_write_config(dev, PCI_COMMAND,
+                                     cmd_value | PCI_COMMAND_IO, 2);
+        } else {
+            pci_default_write_config(dev, PCI_COMMAND,
+                                     cmd_value | PCI_COMMAND_MEMORY, 2);
+        }
+
+        bar_value = addr;
+
+        if (i == PCI_ROM_SLOT) {
+            bar_value |= PCI_ROM_ADDRESS_ENABLE;
+        }
+        /* write the new bar value */
+        pci_default_write_config(dev, bar_address, bar_value, 4);
+
+        /* if this is a 64-bit BAR, we need to also write the
+         * upper 32 bit value.
+         */
+        if (r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+            bar_value = (addr >> 32) & 0xffffffffUL;
+            pci_default_write_config(dev, bar_address + 4, bar_value, 4);
+        }
+        ret = 0;
+    }
+    return ret;
+}
+
+static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
+                                       int phb_index)
+{
+    int slot = PCI_SLOT(dev->devfn);
+    char slotname[16];
+    bool is_bridge = 1;
+    DrcEntry *drc_entry, *drc_entry_slot;
+    uint32_t reg[RESOURCE_CELLS_TOTAL * 8] = { 0 };
+    uint32_t assigned[RESOURCE_CELLS_TOTAL * 8] = { 0 };
+    int reg_size, assigned_size;
+
+    drc_entry = spapr_phb_to_drc_entry(phb_index + SPAPR_PCI_BASE_BUID);
+    g_assert(drc_entry);
+    drc_entry_slot = &drc_entry->child_entries[slot];
+
+    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
+        PCI_HEADER_TYPE_NORMAL) {
+        is_bridge = 0;
+    }
+
+    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
+                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
+    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
+                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
+    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
+                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
+    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
+                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) << 8));
+
+    _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
+                          pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
+
+    /* if this device is NOT a bridge */
+    if (!is_bridge) {
+        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
+            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
+        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
+            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
+        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
+            pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
+        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
+            pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
+    }
+
+    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
+        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
+
+    /* the following fdt cells are masked off the pci status register */
+    int pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
+    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
+                          PCI_STATUS_DEVSEL_MASK & pci_status));
+    _FDT(fdt_setprop_cell(fdt, offset, "fast-back-to-back",
+                          PCI_STATUS_FAST_BACK & pci_status));
+    _FDT(fdt_setprop_cell(fdt, offset, "66mhz-capable",
+                          PCI_STATUS_66MHZ & pci_status));
+    _FDT(fdt_setprop_cell(fdt, offset, "udf-supported",
+                          PCI_STATUS_UDF & pci_status));
+
+    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
+    sprintf(slotname, "Slot %d", slot + phb_index * 32);
+    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", slotname, strlen(slotname)));
+    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index",
+                          drc_entry_slot->drc_index));
+
+    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
+                          RESOURCE_CELLS_ADDRESS));
+    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
+                          RESOURCE_CELLS_SIZE));
+    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
+                          RESOURCE_CELLS_SIZE));
+    fill_resource_props(dev, phb_index, reg, &reg_size,
+                        assigned, &assigned_size);
+    _FDT(fdt_setprop(fdt, offset, "reg", reg, reg_size));
+    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
+                     assigned, assigned_size));
+
+    return 0;
+}
+
+static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
+{
+    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
+    DrcEntry *drc_entry, *drc_entry_slot;
+    ConfigureConnectorState *ccs;
+    int slot = PCI_SLOT(dev->devfn);
+    int offset, ret;
+    void *fdt_orig, *fdt;
+    char nodename[512];
+    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
+                                        INDICATOR_ENTITY_SENSE_MASK,
+                                        INDICATOR_ENTITY_SENSE_SHIFT);
+
+    drc_entry = spapr_phb_to_drc_entry(phb->buid);
+    g_assert(drc_entry);
+    drc_entry_slot = &drc_entry->child_entries[slot];
+
+    drc_entry->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
+    drc_entry->state |= encoded; /* DR entity present */
+    drc_entry_slot->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
+    drc_entry_slot->state |= encoded; /* and the slot */
+
+    /* need to allocate memory region for device BARs */
+    spapr_map_bars(phb, dev);
+
+    /* add OF node for pci device and required OF DT properties */
+    fdt_orig = g_malloc0(FDT_MAX_SIZE);
+    offset = fdt_create(fdt_orig, FDT_MAX_SIZE);
+    fdt_begin_node(fdt_orig, "");
+    fdt_end_node(fdt_orig);
+    fdt_finish(fdt_orig);
+
+    fdt = g_malloc0(FDT_MAX_SIZE);
+    fdt_open_into(fdt_orig, fdt, FDT_MAX_SIZE);
+    sprintf(nodename, "pci@%d", slot);
+    offset = fdt_add_subnode(fdt, 0, nodename);
+    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index);
+    g_assert(!ret);
+    g_free(fdt_orig);
+
+    /* hold on to node, configure_connector will pass it to the guest later */
+    ccs = &drc_entry_slot->cc_state;
+    ccs->fdt = fdt;
+    ccs->offset_start = offset;
+    ccs->state = CC_STATE_PENDING;
+    ccs->dev = dev;
+
+    return 0;
+}
+
+/* check whether guest has released/isolated device */
+static bool spapr_drc_state_is_releasable(DrcEntry *drc_entry)
+{
+    return !DECODE_DRC_STATE(drc_entry->state,
+                             INDICATOR_ISOLATION_MASK,
+                             INDICATOR_ISOLATION_SHIFT);
+}
+
+/* finalize device unplug/deletion */
+static void spapr_drc_state_reset(DrcEntry *drc_entry)
+{
+    ConfigureConnectorState *ccs = &drc_entry->cc_state;
+    uint32_t sense_empty = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_EMPTY,
+                                            INDICATOR_ENTITY_SENSE_MASK,
+                                            INDICATOR_ENTITY_SENSE_SHIFT);
+
+    g_free(ccs->fdt);
+    ccs->fdt = NULL;
+    object_unparent(OBJECT(ccs->dev));
+    ccs->dev = NULL;
+    ccs->state = CC_STATE_IDLE;
+    drc_entry->state &= ~INDICATOR_ENTITY_SENSE_MASK;
+    drc_entry->state |= sense_empty;
+    drc_entry->awaiting_release = false;
+}
+
+static void spapr_device_hotplug_remove(DeviceState *qdev, PCIDevice *dev)
+{
+    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
+    DrcEntry *drc_entry, *drc_entry_slot;
+    ConfigureConnectorState *ccs;
+    int slot = PCI_SLOT(dev->devfn);
+
+    drc_entry = spapr_phb_to_drc_entry(phb->buid);
+    g_assert(drc_entry);
+    drc_entry_slot = &drc_entry->child_entries[slot];
+    ccs = &drc_entry_slot->cc_state;
+    /* shouldn't be removing devices we haven't created an fdt for */
+    g_assert(ccs->state != CC_STATE_IDLE);
+    /* if the device has already been released/isolated by guest, go ahead
+     * and remove it now. Otherwise, flag it as pending guest release so it
+     * can be removed later
+     */
+    if (spapr_drc_state_is_releasable(drc_entry_slot)) {
+        spapr_drc_state_reset(drc_entry_slot);
+    } else {
+        if (drc_entry_slot->awaiting_release) {
+            fprintf(stderr, "waiting for guest to release the device");
+        } else {
+            drc_entry_slot->awaiting_release = true;
+        }
+    }
+}
+
+static int spapr_device_hotplug(DeviceState *qdev, PCIDevice *dev,
+                                PCIHotplugState state)
+{
+    if (state == PCI_COLDPLUG_ENABLED) {
+        return 0;
+    }
+
+    if (state == PCI_HOTPLUG_ENABLED) {
+        spapr_device_hotplug_add(qdev, dev);
+    } else {
+        spapr_device_hotplug_remove(qdev, dev);
+    }
+
+    return 0;
+}
+
 static int spapr_phb_init(SysBusDevice *s)
 {
     DeviceState *dev = DEVICE(s);
@@ -889,6 +1255,7 @@  static int spapr_phb_init(SysBusDevice *s)
                            &sphb->memspace, &sphb->iospace,
                            PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
     phb->bus = bus;
+    pci_bus_hotplug(phb->bus, spapr_device_hotplug, DEVICE(sphb));
 
     sphb->dma_window_start = 0;
     sphb->dma_window_size = 0x40000000;
@@ -1181,14 +1548,6 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
         return bus_off;
     }
 
-#define _FDT(exp) \
-    do { \
-        int ret = (exp);                                           \
-        if (ret < 0) {                                             \
-            return ret;                                            \
-        }                                                          \
-    } while (0)
-
     /* Write PHB properties */
     _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
     _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 7c8a521..1c9b725 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -328,6 +328,7 @@  struct DrcEntry {
     void *fdt;
     int fdt_offset;
     uint32_t state;
+    bool awaiting_release;
     ConfigureConnectorState cc_state;
     DrcEntry *child_entries;
 };