diff mbox

[09/12] spapr_pci: enable basic hotplug operations

Message ID 1408407718-10835-10-git-send-email-mdroth@linux.vnet.ibm.com
State New
Headers show

Commit Message

Michael Roth Aug. 19, 2014, 12:21 a.m. UTC
This enables hotplug for PHB bridges. Upon hotplug we generate the
OF-nodes required by PAPR specification and IEEE 1275-1994
"PCI Bus Binding to Open Firmware" for the device.

We associate the corresponding FDT for these nodes with the DrcEntry
corresponding to the slot, which will be fetched via
ibm,configure-connector RTAS calls by the guest as described by PAPR
specification. The FDT is cleaned up in the case of unplug.

Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 hw/ppc/spapr_pci.c     | 235 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/hw/ppc/spapr.h |   1 +
 2 files changed, 228 insertions(+), 8 deletions(-)

Comments

Alexey Kardashevskiy Aug. 26, 2014, 9:40 a.m. UTC | #1
On 08/19/2014 10:21 AM, Michael Roth wrote:
> This enables hotplug for PHB bridges. Upon hotplug we generate the
> OF-nodes required by PAPR specification and IEEE 1275-1994
> "PCI Bus Binding to Open Firmware" for the device.
> 
> We associate the corresponding FDT for these nodes with the DrcEntry
> corresponding to the slot, which will be fetched via
> ibm,configure-connector RTAS calls by the guest as described by PAPR
> specification. The FDT is cleaned up in the case of unplug.
> 
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr_pci.c     | 235 +++++++++++++++++++++++++++++++++++++++++++++++--
>  include/hw/ppc/spapr.h |   1 +
>  2 files changed, 228 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 96a57be..23864ab 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -87,6 +87,17 @@
>  #define ENCODE_DRC_STATE(val, m, s) \
>      (((uint32_t)(val) << (s)) & (uint32_t)(m))
>  
> +#define FDT_MAX_SIZE            0x10000
> +#define _FDT(exp) \
> +    do { \
> +        int ret = (exp);                                           \
> +        if (ret < 0) {                                             \
> +            return ret;                                            \
> +        }                                                          \
> +    } while (0)
> +
> +static void spapr_drc_state_reset(sPAPRDrcEntry *drc_entry);
> +
>  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
>  {
>      sPAPRPHBState *sphb;
> @@ -476,6 +487,22 @@ static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>          /* encode the new value into the correct bit field */
>          shift = INDICATOR_ISOLATION_SHIFT;
>          mask = INDICATOR_ISOLATION_MASK;
> +        if (drc_entry) {
> +            /* transition from unisolated to isolated for a hotplug slot
> +             * entails completion of guest-side device unplug/cleanup, so
> +             * we can now safely remove the device if qemu is waiting for
> +             * it to be released
> +             */
> +            if (DECODE_DRC_STATE(*pind, mask, shift) != indicator_state) {
> +                if (indicator_state == 0 && drc_entry->awaiting_release) {
> +                    /* device_del has been called and host is waiting
> +                     * for guest to release/isolate device, go ahead
> +                     * and remove it now
> +                     */
> +                    spapr_drc_state_reset(drc_entry);
> +                }
> +            }
> +        }
>          break;
>      case 9002: /* DR */
>          shift = INDICATOR_DR_SHIFT;
> @@ -816,6 +843,198 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
>      return &phb->iommu_as;
>  }
>  
> +static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
> +                                       int phb_index)
> +{
> +    int slot = PCI_SLOT(dev->devfn);
> +    char slotname[16];
> +    bool is_bridge = 1;
> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> +    uint32_t reg[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> +    uint32_t assigned[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> +    int reg_size, assigned_size;
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb_index + SPAPR_PCI_BASE_BUID);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +
> +    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
> +        PCI_HEADER_TYPE_NORMAL) {
> +        is_bridge = 0;
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
> +                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
> +                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
> +                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
> +                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) << 8));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
> +                          pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
> +
> +    /* if this device is NOT a bridge */
> +    if (!is_bridge) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
> +            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
> +            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
> +            pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
> +            pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
> +        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
> +
> +    /* the following fdt cells are masked off the pci status register */
> +    int pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
> +    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
> +                          PCI_STATUS_DEVSEL_MASK & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "fast-back-to-back",
> +                          PCI_STATUS_FAST_BACK & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "66mhz-capable",
> +                          PCI_STATUS_66MHZ & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "udf-supported",
> +                          PCI_STATUS_UDF & pci_status));
> +
> +    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
> +    sprintf(slotname, "Slot %d", slot + phb_index * 32);
> +    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", slotname, strlen(slotname)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index",
> +                          drc_entry_slot->drc_index));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
> +                          RESOURCE_CELLS_ADDRESS));
> +    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
> +                          RESOURCE_CELLS_SIZE));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
> +                          RESOURCE_CELLS_SIZE));
> +    fill_resource_props(dev, phb_index, reg, &reg_size,
> +                        assigned, &assigned_size);
> +    _FDT(fdt_setprop(fdt, offset, "reg", reg, reg_size));
> +    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
> +                     assigned, assigned_size));
> +
> +    return 0;
> +}
> +
> +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)


Minor comment here too :)

Will this function ever support anything but sPAPRPHBState as a bus device?
It is not a callback and could receive what it actually needs - sPAPRPHBState.



> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> +    sPAPRConfigureConnectorState *ccs;
> +    int slot = PCI_SLOT(dev->devfn);
> +    int offset, ret;
> +    void *fdt_orig, *fdt;
> +    char nodename[512];
> +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
> +                                        INDICATOR_ENTITY_SENSE_MASK,
> +                                        INDICATOR_ENTITY_SENSE_SHIFT);
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb->buid);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +
> +    drc_entry->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry->state |= encoded; /* DR entity present */
> +    drc_entry_slot->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry_slot->state |= encoded; /* and the slot */
> +
> +    /* add OF node for pci device and required OF DT properties */
> +    fdt_orig = g_malloc0(FDT_MAX_SIZE);
> +    offset = fdt_create(fdt_orig, FDT_MAX_SIZE);
> +    fdt_begin_node(fdt_orig, "");
> +    fdt_end_node(fdt_orig);
> +    fdt_finish(fdt_orig);
> +
> +    fdt = g_malloc0(FDT_MAX_SIZE);
> +    fdt_open_into(fdt_orig, fdt, FDT_MAX_SIZE);
> +    sprintf(nodename, "pci@%d", slot);
> +    offset = fdt_add_subnode(fdt, 0, nodename);
> +    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index);
> +    g_assert(!ret);
> +    g_free(fdt_orig);
> +
> +    /* hold on to node, configure_connector will pass it to the guest later */
> +    ccs = &drc_entry_slot->cc_state;
> +    ccs->fdt = fdt;
> +    ccs->offset_start = offset;
> +    ccs->state = CC_STATE_PENDING;
> +    ccs->dev = dev;
> +
> +    return 0;
> +}
> +
> +/* check whether guest has released/isolated device */
> +static bool spapr_drc_state_is_releasable(sPAPRDrcEntry *drc_entry)
> +{
> +    return !DECODE_DRC_STATE(drc_entry->state,
> +                             INDICATOR_ISOLATION_MASK,
> +                             INDICATOR_ISOLATION_SHIFT);
> +}
> +
> +/* finalize device unplug/deletion */
> +static void spapr_drc_state_reset(sPAPRDrcEntry *drc_entry)
> +{
> +    sPAPRConfigureConnectorState *ccs = &drc_entry->cc_state;
> +    uint32_t sense_empty = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_EMPTY,
> +                                            INDICATOR_ENTITY_SENSE_MASK,
> +                                            INDICATOR_ENTITY_SENSE_SHIFT);
> +
> +    g_free(ccs->fdt);
> +    ccs->fdt = NULL;
> +    object_unparent(OBJECT(ccs->dev));
> +    ccs->dev = NULL;
> +    ccs->state = CC_STATE_IDLE;
> +    drc_entry->state &= ~INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry->state |= sense_empty;
> +    drc_entry->awaiting_release = false;
> +}
> +
> +static void spapr_device_hotplug_remove(DeviceState *qdev, PCIDevice *dev)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> +    sPAPRConfigureConnectorState *ccs;
> +    int slot = PCI_SLOT(dev->devfn);
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb->buid);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +    ccs = &drc_entry_slot->cc_state;
> +    /* shouldn't be removing devices we haven't created an fdt for */
> +    g_assert(ccs->state != CC_STATE_IDLE);
> +    /* if the device has already been released/isolated by guest, go ahead
> +     * and remove it now. Otherwise, flag it as pending guest release so it
> +     * can be removed later
> +     */
> +    if (spapr_drc_state_is_releasable(drc_entry_slot)) {
> +        spapr_drc_state_reset(drc_entry_slot);
> +    } else {
> +        if (drc_entry_slot->awaiting_release) {
> +            fprintf(stderr, "waiting for guest to release the device");
> +        } else {
> +            drc_entry_slot->awaiting_release = true;
> +        }
> +    }
> +}
> +
> +static void spapr_phb_hot_plug(HotplugHandler *plug_handler,
> +                               DeviceState *plugged_dev, Error **errp)
> +{
> +    spapr_device_hotplug_add(DEVICE(plug_handler), PCI_DEVICE(plugged_dev));
> +}
> +
> +static void spapr_phb_hot_unplug(HotplugHandler *plug_handler,
> +                                 DeviceState *plugged_dev, Error **errp)
> +{
> +    spapr_device_hotplug_remove(DEVICE(plug_handler), PCI_DEVICE(plugged_dev));
> +}
> +
>  static void spapr_phb_realize(DeviceState *dev, Error **errp)
>  {
>      SysBusDevice *s = SYS_BUS_DEVICE(dev);
> @@ -903,6 +1122,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>                             &sphb->memspace, &sphb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
>      phb->bus = bus;
> +    qbus_set_hotplug_handler(BUS(phb->bus), DEVICE(sphb), NULL);
>  
>      /*
>       * Initialize PHB address space.
> @@ -1108,6 +1328,7 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
>      PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
>      DeviceClass *dc = DEVICE_CLASS(klass);
>      sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
> +    HotplugHandlerClass *hp = HOTPLUG_HANDLER_CLASS(klass);
>  
>      hc->root_bus_path = spapr_phb_root_bus_path;
>      dc->realize = spapr_phb_realize;
> @@ -1117,6 +1338,8 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
>      set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
>      dc->cannot_instantiate_with_device_add_yet = false;
>      spc->finish_realize = spapr_phb_finish_realize;
> +    hp->plug = spapr_phb_hot_plug;
> +    hp->unplug = spapr_phb_hot_unplug;
>  }
>  
>  static const TypeInfo spapr_phb_info = {
> @@ -1125,6 +1348,10 @@ static const TypeInfo spapr_phb_info = {
>      .instance_size = sizeof(sPAPRPHBState),
>      .class_init    = spapr_phb_class_init,
>      .class_size    = sizeof(sPAPRPHBClass),
> +    .interfaces    = (InterfaceInfo[]) {
> +        { TYPE_HOTPLUG_HANDLER },
> +        { }
> +    }
>  };
>  
>  PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
> @@ -1304,14 +1531,6 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>          return bus_off;
>      }
>  
> -#define _FDT(exp) \
> -    do { \
> -        int ret = (exp);                                           \
> -        if (ret < 0) {                                             \
> -            return ret;                                            \
> -        }                                                          \
> -    } while (0)
> -
>      /* Write PHB properties */
>      _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
>      _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index fac85f8..e08dd2f 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -36,6 +36,7 @@ struct sPAPRDrcEntry {
>      void *fdt;
>      int fdt_offset;
>      uint32_t state;
> +    bool awaiting_release;
>      sPAPRConfigureConnectorState cc_state;
>      sPAPRDrcEntry *child_entries;
>  };
>
Alexander Graf Aug. 26, 2014, 12:30 p.m. UTC | #2
On 19.08.14 02:21, Michael Roth wrote:
> This enables hotplug for PHB bridges. Upon hotplug we generate the
> OF-nodes required by PAPR specification and IEEE 1275-1994
> "PCI Bus Binding to Open Firmware" for the device.
> 
> We associate the corresponding FDT for these nodes with the DrcEntry
> corresponding to the slot, which will be fetched via
> ibm,configure-connector RTAS calls by the guest as described by PAPR
> specification. The FDT is cleaned up in the case of unplug.
> 
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr_pci.c     | 235 +++++++++++++++++++++++++++++++++++++++++++++++--
>  include/hw/ppc/spapr.h |   1 +
>  2 files changed, 228 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 96a57be..23864ab 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -87,6 +87,17 @@
>  #define ENCODE_DRC_STATE(val, m, s) \
>      (((uint32_t)(val) << (s)) & (uint32_t)(m))
>  
> +#define FDT_MAX_SIZE            0x10000
> +#define _FDT(exp) \
> +    do { \
> +        int ret = (exp);                                           \
> +        if (ret < 0) {                                             \
> +            return ret;                                            \
> +        }                                                          \
> +    } while (0)
> +
> +static void spapr_drc_state_reset(sPAPRDrcEntry *drc_entry);
> +
>  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
>  {
>      sPAPRPHBState *sphb;
> @@ -476,6 +487,22 @@ static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>          /* encode the new value into the correct bit field */
>          shift = INDICATOR_ISOLATION_SHIFT;
>          mask = INDICATOR_ISOLATION_MASK;
> +        if (drc_entry) {
> +            /* transition from unisolated to isolated for a hotplug slot
> +             * entails completion of guest-side device unplug/cleanup, so
> +             * we can now safely remove the device if qemu is waiting for
> +             * it to be released
> +             */
> +            if (DECODE_DRC_STATE(*pind, mask, shift) != indicator_state) {
> +                if (indicator_state == 0 && drc_entry->awaiting_release) {
> +                    /* device_del has been called and host is waiting
> +                     * for guest to release/isolate device, go ahead
> +                     * and remove it now
> +                     */
> +                    spapr_drc_state_reset(drc_entry);
> +                }
> +            }
> +        }
>          break;
>      case 9002: /* DR */
>          shift = INDICATOR_DR_SHIFT;
> @@ -816,6 +843,198 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
>      return &phb->iommu_as;
>  }
>  
> +static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
> +                                       int phb_index)
> +{
> +    int slot = PCI_SLOT(dev->devfn);
> +    char slotname[16];
> +    bool is_bridge = 1;
> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> +    uint32_t reg[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> +    uint32_t assigned[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> +    int reg_size, assigned_size;
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb_index + SPAPR_PCI_BASE_BUID);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +
> +    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
> +        PCI_HEADER_TYPE_NORMAL) {
> +        is_bridge = 0;
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
> +                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
> +                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
> +                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
> +                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) << 8));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
> +                          pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
> +
> +    /* if this device is NOT a bridge */
> +    if (!is_bridge) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
> +            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
> +            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
> +            pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
> +            pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
> +        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
> +
> +    /* the following fdt cells are masked off the pci status register */
> +    int pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
> +    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
> +                          PCI_STATUS_DEVSEL_MASK & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "fast-back-to-back",
> +                          PCI_STATUS_FAST_BACK & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "66mhz-capable",
> +                          PCI_STATUS_66MHZ & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "udf-supported",
> +                          PCI_STATUS_UDF & pci_status));
> +
> +    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
> +    sprintf(slotname, "Slot %d", slot + phb_index * 32);
> +    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", slotname, strlen(slotname)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index",
> +                          drc_entry_slot->drc_index));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
> +                          RESOURCE_CELLS_ADDRESS));
> +    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
> +                          RESOURCE_CELLS_SIZE));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
> +                          RESOURCE_CELLS_SIZE));
> +    fill_resource_props(dev, phb_index, reg, &reg_size,
> +                        assigned, &assigned_size);
> +    _FDT(fdt_setprop(fdt, offset, "reg", reg, reg_size));
> +    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
> +                     assigned, assigned_size));
> +
> +    return 0;
> +}
> +
> +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> +    sPAPRConfigureConnectorState *ccs;
> +    int slot = PCI_SLOT(dev->devfn);
> +    int offset, ret;
> +    void *fdt_orig, *fdt;
> +    char nodename[512];

I think we're better off using g_strdup_printf.

> +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
> +                                        INDICATOR_ENTITY_SENSE_MASK,
> +                                        INDICATOR_ENTITY_SENSE_SHIFT);
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb->buid);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +
> +    drc_entry->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry->state |= encoded; /* DR entity present */
> +    drc_entry_slot->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry_slot->state |= encoded; /* and the slot */
> +
> +    /* add OF node for pci device and required OF DT properties */
> +    fdt_orig = g_malloc0(FDT_MAX_SIZE);
> +    offset = fdt_create(fdt_orig, FDT_MAX_SIZE);
> +    fdt_begin_node(fdt_orig, "");
> +    fdt_end_node(fdt_orig);
> +    fdt_finish(fdt_orig);
> +
> +    fdt = g_malloc0(FDT_MAX_SIZE);
> +    fdt_open_into(fdt_orig, fdt, FDT_MAX_SIZE);
> +    sprintf(nodename, "pci@%d", slot);
> +    offset = fdt_add_subnode(fdt, 0, nodename);
> +    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index);
> +    g_assert(!ret);
> +    g_free(fdt_orig);
> +
> +    /* hold on to node, configure_connector will pass it to the guest later */
> +    ccs = &drc_entry_slot->cc_state;
> +    ccs->fdt = fdt;
> +    ccs->offset_start = offset;
> +    ccs->state = CC_STATE_PENDING;
> +    ccs->dev = dev;
> +
> +    return 0;
> +}
> +
> +/* check whether guest has released/isolated device */
> +static bool spapr_drc_state_is_releasable(sPAPRDrcEntry *drc_entry)
> +{
> +    return !DECODE_DRC_STATE(drc_entry->state,
> +                             INDICATOR_ISOLATION_MASK,
> +                             INDICATOR_ISOLATION_SHIFT);
> +}
> +
> +/* finalize device unplug/deletion */
> +static void spapr_drc_state_reset(sPAPRDrcEntry *drc_entry)
> +{
> +    sPAPRConfigureConnectorState *ccs = &drc_entry->cc_state;
> +    uint32_t sense_empty = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_EMPTY,
> +                                            INDICATOR_ENTITY_SENSE_MASK,
> +                                            INDICATOR_ENTITY_SENSE_SHIFT);
> +
> +    g_free(ccs->fdt);
> +    ccs->fdt = NULL;
> +    object_unparent(OBJECT(ccs->dev));
> +    ccs->dev = NULL;
> +    ccs->state = CC_STATE_IDLE;
> +    drc_entry->state &= ~INDICATOR_ENTITY_SENSE_MASK;
> +    drc_entry->state |= sense_empty;
> +    drc_entry->awaiting_release = false;
> +}
> +
> +static void spapr_device_hotplug_remove(DeviceState *qdev, PCIDevice *dev)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> +    sPAPRConfigureConnectorState *ccs;
> +    int slot = PCI_SLOT(dev->devfn);
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb->buid);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +    ccs = &drc_entry_slot->cc_state;
> +    /* shouldn't be removing devices we haven't created an fdt for */
> +    g_assert(ccs->state != CC_STATE_IDLE);
> +    /* if the device has already been released/isolated by guest, go ahead
> +     * and remove it now. Otherwise, flag it as pending guest release so it
> +     * can be removed later
> +     */
> +    if (spapr_drc_state_is_releasable(drc_entry_slot)) {
> +        spapr_drc_state_reset(drc_entry_slot);
> +    } else {
> +        if (drc_entry_slot->awaiting_release) {
> +            fprintf(stderr, "waiting for guest to release the device");

Please add a timer in this case that force ejects the device. Also we
need to tell the caller that the device hasn't been ejected yet, no? How
does our ACPI implementation deal with uncooperative guests and
notifying upper stacks about them?


Alex
Bharata B Rao Sept. 3, 2014, 10:33 a.m. UTC | #3
On Tue, Aug 19, 2014 at 5:51 AM, Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> This enables hotplug for PHB bridges. Upon hotplug we generate the
> OF-nodes required by PAPR specification and IEEE 1275-1994
> "PCI Bus Binding to Open Firmware" for the device.
>
> We associate the corresponding FDT for these nodes with the DrcEntry
> corresponding to the slot, which will be fetched via
> ibm,configure-connector RTAS calls by the guest as described by PAPR
> specification. The FDT is cleaned up in the case of unplug.
>
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr_pci.c     | 235 +++++++++++++++++++++++++++++++++++++++++++++++--
>  include/hw/ppc/spapr.h |   1 +
>  2 files changed, 228 insertions(+), 8 deletions(-)
>
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 96a57be..23864ab 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -87,6 +87,17 @@
>  #define ENCODE_DRC_STATE(val, m, s) \
>      (((uint32_t)(val) << (s)) & (uint32_t)(m))
>
> +#define FDT_MAX_SIZE            0x10000
> +#define _FDT(exp) \
> +    do { \
> +        int ret = (exp);                                           \
> +        if (ret < 0) {                                             \
> +            return ret;                                            \
> +        }                                                          \
> +    } while (0)
> +
> +static void spapr_drc_state_reset(sPAPRDrcEntry *drc_entry);
> +
>  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
>  {
>      sPAPRPHBState *sphb;
> @@ -476,6 +487,22 @@ static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>          /* encode the new value into the correct bit field */
>          shift = INDICATOR_ISOLATION_SHIFT;
>          mask = INDICATOR_ISOLATION_MASK;
> +        if (drc_entry) {
> +            /* transition from unisolated to isolated for a hotplug slot
> +             * entails completion of guest-side device unplug/cleanup, so
> +             * we can now safely remove the device if qemu is waiting for
> +             * it to be released
> +             */
> +            if (DECODE_DRC_STATE(*pind, mask, shift) != indicator_state) {
> +                if (indicator_state == 0 && drc_entry->awaiting_release) {
> +                    /* device_del has been called and host is waiting
> +                     * for guest to release/isolate device, go ahead
> +                     * and remove it now
> +                     */
> +                    spapr_drc_state_reset(drc_entry);
> +                }
> +            }
> +        }
>          break;
>      case 9002: /* DR */
>          shift = INDICATOR_DR_SHIFT;
> @@ -816,6 +843,198 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
>      return &phb->iommu_as;
>  }
>
> +static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
> +                                       int phb_index)
> +{
> +    int slot = PCI_SLOT(dev->devfn);
> +    char slotname[16];
> +    bool is_bridge = 1;
> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> +    uint32_t reg[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> +    uint32_t assigned[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> +    int reg_size, assigned_size;
> +
> +    drc_entry = spapr_phb_to_drc_entry(phb_index + SPAPR_PCI_BASE_BUID);
> +    g_assert(drc_entry);
> +    drc_entry_slot = &drc_entry->child_entries[slot];
> +
> +    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
> +        PCI_HEADER_TYPE_NORMAL) {
> +        is_bridge = 0;
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
> +                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
> +                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
> +                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
> +                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) << 8));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
> +                          pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
> +
> +    /* if this device is NOT a bridge */
> +    if (!is_bridge) {
> +        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
> +            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
> +            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
> +            pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
> +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
> +            pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
> +    }
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
> +        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
> +
> +    /* the following fdt cells are masked off the pci status register */
> +    int pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
> +    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
> +                          PCI_STATUS_DEVSEL_MASK & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "fast-back-to-back",
> +                          PCI_STATUS_FAST_BACK & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "66mhz-capable",
> +                          PCI_STATUS_66MHZ & pci_status));
> +    _FDT(fdt_setprop_cell(fdt, offset, "udf-supported",
> +                          PCI_STATUS_UDF & pci_status));
> +
> +    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
> +    sprintf(slotname, "Slot %d", slot + phb_index * 32);
> +    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", slotname, strlen(slotname)));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index",
> +                          drc_entry_slot->drc_index));
> +
> +    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
> +                          RESOURCE_CELLS_ADDRESS));
> +    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
> +                          RESOURCE_CELLS_SIZE));
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
> +                          RESOURCE_CELLS_SIZE));
> +    fill_resource_props(dev, phb_index, reg, &reg_size,
> +                        assigned, &assigned_size);
> +    _FDT(fdt_setprop(fdt, offset, "reg", reg, reg_size));
> +    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
> +                     assigned, assigned_size));
> +
> +    return 0;
> +}
> +
> +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
> +{
> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> +    sPAPRConfigureConnectorState *ccs;
> +    int slot = PCI_SLOT(dev->devfn);
> +    int offset, ret;
> +    void *fdt_orig, *fdt;
> +    char nodename[512];
> +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
> +                                        INDICATOR_ENTITY_SENSE_MASK,
> +                                        INDICATOR_ENTITY_SENSE_SHIFT);
> +

I am building on this infrastructure of yours and adding CPU hotplug
support to sPAPR guests.

So we start with dr state of UNUSABLE and change it to PRESENT like
above when performing hotplug operation. But after this, in case of
CPU hotplug, the CPU hotplug code in the kernel
(arch/powerpc/platforms/pseries/dlpar.c:dlpar_acquire_drc()) does
get-sensor-state rtas call and expects the dr state to be UNUSABLE. Is
the guest kernel right in expecting dr state to be in UNUSABLE state
like this ? I have in fact disabled this check in the guest kernel to
get a CPU hotplugged successfully, but wanted to understand the state
changes and the expectations from the guest kernel correctly.

Regards,
Bharata.
Michael Roth Sept. 3, 2014, 11:03 p.m. UTC | #4
Quoting Bharata B Rao (2014-09-03 05:33:56)
> On Tue, Aug 19, 2014 at 5:51 AM, Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> > This enables hotplug for PHB bridges. Upon hotplug we generate the
> > OF-nodes required by PAPR specification and IEEE 1275-1994
> > "PCI Bus Binding to Open Firmware" for the device.
> >
> > We associate the corresponding FDT for these nodes with the DrcEntry
> > corresponding to the slot, which will be fetched via
> > ibm,configure-connector RTAS calls by the guest as described by PAPR
> > specification. The FDT is cleaned up in the case of unplug.
> >
> > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > ---
> >  hw/ppc/spapr_pci.c     | 235 +++++++++++++++++++++++++++++++++++++++++++++++--
> >  include/hw/ppc/spapr.h |   1 +
> >  2 files changed, 228 insertions(+), 8 deletions(-)
> >
> > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > index 96a57be..23864ab 100644
> > --- a/hw/ppc/spapr_pci.c
> > +++ b/hw/ppc/spapr_pci.c
> > @@ -87,6 +87,17 @@
> >  #define ENCODE_DRC_STATE(val, m, s) \
> >      (((uint32_t)(val) << (s)) & (uint32_t)(m))
> >
> > +#define FDT_MAX_SIZE            0x10000
> > +#define _FDT(exp) \
> > +    do { \
> > +        int ret = (exp);                                           \
> > +        if (ret < 0) {                                             \
> > +            return ret;                                            \
> > +        }                                                          \
> > +    } while (0)
> > +
> > +static void spapr_drc_state_reset(sPAPRDrcEntry *drc_entry);
> > +
> >  static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
> >  {
> >      sPAPRPHBState *sphb;
> > @@ -476,6 +487,22 @@ static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> >          /* encode the new value into the correct bit field */
> >          shift = INDICATOR_ISOLATION_SHIFT;
> >          mask = INDICATOR_ISOLATION_MASK;
> > +        if (drc_entry) {
> > +            /* transition from unisolated to isolated for a hotplug slot
> > +             * entails completion of guest-side device unplug/cleanup, so
> > +             * we can now safely remove the device if qemu is waiting for
> > +             * it to be released
> > +             */
> > +            if (DECODE_DRC_STATE(*pind, mask, shift) != indicator_state) {
> > +                if (indicator_state == 0 && drc_entry->awaiting_release) {
> > +                    /* device_del has been called and host is waiting
> > +                     * for guest to release/isolate device, go ahead
> > +                     * and remove it now
> > +                     */
> > +                    spapr_drc_state_reset(drc_entry);
> > +                }
> > +            }
> > +        }
> >          break;
> >      case 9002: /* DR */
> >          shift = INDICATOR_DR_SHIFT;
> > @@ -816,6 +843,198 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
> >      return &phb->iommu_as;
> >  }
> >
> > +static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
> > +                                       int phb_index)
> > +{
> > +    int slot = PCI_SLOT(dev->devfn);
> > +    char slotname[16];
> > +    bool is_bridge = 1;
> > +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> > +    uint32_t reg[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> > +    uint32_t assigned[RESOURCE_CELLS_TOTAL * 8] = { 0 };
> > +    int reg_size, assigned_size;
> > +
> > +    drc_entry = spapr_phb_to_drc_entry(phb_index + SPAPR_PCI_BASE_BUID);
> > +    g_assert(drc_entry);
> > +    drc_entry_slot = &drc_entry->child_entries[slot];
> > +
> > +    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
> > +        PCI_HEADER_TYPE_NORMAL) {
> > +        is_bridge = 0;
> > +    }
> > +
> > +    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
> > +                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
> > +                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
> > +                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
> > +                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) << 8));
> > +
> > +    _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
> > +                          pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
> > +
> > +    /* if this device is NOT a bridge */
> > +    if (!is_bridge) {
> > +        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
> > +            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
> > +        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
> > +            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
> > +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
> > +            pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
> > +        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
> > +            pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
> > +    }
> > +
> > +    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
> > +        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
> > +
> > +    /* the following fdt cells are masked off the pci status register */
> > +    int pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
> > +    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
> > +                          PCI_STATUS_DEVSEL_MASK & pci_status));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "fast-back-to-back",
> > +                          PCI_STATUS_FAST_BACK & pci_status));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "66mhz-capable",
> > +                          PCI_STATUS_66MHZ & pci_status));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "udf-supported",
> > +                          PCI_STATUS_UDF & pci_status));
> > +
> > +    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
> > +    sprintf(slotname, "Slot %d", slot + phb_index * 32);
> > +    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", slotname, strlen(slotname)));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index",
> > +                          drc_entry_slot->drc_index));
> > +
> > +    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
> > +                          RESOURCE_CELLS_ADDRESS));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
> > +                          RESOURCE_CELLS_SIZE));
> > +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
> > +                          RESOURCE_CELLS_SIZE));
> > +    fill_resource_props(dev, phb_index, reg, &reg_size,
> > +                        assigned, &assigned_size);
> > +    _FDT(fdt_setprop(fdt, offset, "reg", reg, reg_size));
> > +    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
> > +                     assigned, assigned_size));
> > +
> > +    return 0;
> > +}
> > +
> > +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
> > +{
> > +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> > +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> > +    sPAPRConfigureConnectorState *ccs;
> > +    int slot = PCI_SLOT(dev->devfn);
> > +    int offset, ret;
> > +    void *fdt_orig, *fdt;
> > +    char nodename[512];
> > +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
> > +                                        INDICATOR_ENTITY_SENSE_MASK,
> > +                                        INDICATOR_ENTITY_SENSE_SHIFT);
> > +
> 
> I am building on this infrastructure of yours and adding CPU hotplug
> support to sPAPR guests.
> 
> So we start with dr state of UNUSABLE and change it to PRESENT like
> above when performing hotplug operation. But after this, in case of
> CPU hotplug, the CPU hotplug code in the kernel
> (arch/powerpc/platforms/pseries/dlpar.c:dlpar_acquire_drc()) does
> get-sensor-state rtas call and expects the dr state to be UNUSABLE. Is
> the guest kernel right in expecting dr state to be in UNUSABLE state
> like this ? I have in fact disabled this check in the guest kernel to
> get a CPU hotplugged successfully, but wanted to understand the state
> changes and the expectations from the guest kernel correctly.

According to PAPR+ 2.7 13.5.3.3,

  PRESENT (1):
  
  Returned for logical and physical DR entities when the DR connector is
  allocated to the OS and the DR entity is present. For physical DR entities,
  this indicates that the DR connector actually has a DR entity plugged into
  it. For DR connectors of physical DR entities, the DR connector must be
  allocated to the OS to return this value, otherwise a status of -3, no such
  sensor implemented, will be returned from the get-sensor-state RTAS call. For
  DR connectors of logical DR entities, the DR connector must be allocated to
  the OS to return this value, otherwise a sensor value of 2 or 3 will be
  returned.
  
  UNUSABLE (2):
  
  Returned for logical DR entities when the DR entity is not currently
  available to the OS, but may possibly be made available to the OS by calling
  set-indicator with the allocation-state indicator, setting that indicator to
  usable.

So it seems 'PRESENT' is in fact the right value immediately after PCI
hotplug, but it doesn't seem clear from the documentation whether 'PRESENT'
or 'UNUSABLE' is more correct immediately after CPU hotplug. What does
seem clear as that UNUSABLE does not have any use in the case of PCI
devices: just PRESENT/EMPTY(0).

But we actually 0-initialize the sensor field for DRCEntrys corresponding
to PCI devices, which corresponds to 'EMPTY' (0), so the handling seems
correct for PCI devices...

And since we already initialize PHB sensors to UNUSABLE in the top-level
DRC list, I'm not sure why adjacent CPU entries would be affected by what
we do later for PCI devices? It seems like you'd just need to do the
equivalent of what we do for PHBs during realize:

  spapr_add_phb_to_drc_table(sphb->buid, 2 /* Unusable */);

So I'm not sure where the need for guest kernel changes is coming from for
CPU hotplug. Do you have a snippet of what the initialize/hot_add hooks
like in your case?

> 
> Regards,
> Bharata.
Bharata B Rao Sept. 4, 2014, 3:08 p.m. UTC | #5
On Thu, Sep 4, 2014 at 4:33 AM, Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
>> > +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
>> > +{
>> > +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
>> > +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
>> > +    sPAPRConfigureConnectorState *ccs;
>> > +    int slot = PCI_SLOT(dev->devfn);
>> > +    int offset, ret;
>> > +    void *fdt_orig, *fdt;
>> > +    char nodename[512];
>> > +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
>> > +                                        INDICATOR_ENTITY_SENSE_MASK,
>> > +                                        INDICATOR_ENTITY_SENSE_SHIFT);
>> > +
>>
>> I am building on this infrastructure of yours and adding CPU hotplug
>> support to sPAPR guests.
>>
>> So we start with dr state of UNUSABLE and change it to PRESENT like
>> above when performing hotplug operation. But after this, in case of
>> CPU hotplug, the CPU hotplug code in the kernel
>> (arch/powerpc/platforms/pseries/dlpar.c:dlpar_acquire_drc()) does
>> get-sensor-state rtas call and expects the dr state to be UNUSABLE. Is
>> the guest kernel right in expecting dr state to be in UNUSABLE state
>> like this ? I have in fact disabled this check in the guest kernel to
>> get a CPU hotplugged successfully, but wanted to understand the state
>> changes and the expectations from the guest kernel correctly.
>
> According to PAPR+ 2.7 13.5.3.3,
>
>   PRESENT (1):
>
>   Returned for logical and physical DR entities when the DR connector is
>   allocated to the OS and the DR entity is present. For physical DR entities,
>   this indicates that the DR connector actually has a DR entity plugged into
>   it. For DR connectors of physical DR entities, the DR connector must be
>   allocated to the OS to return this value, otherwise a status of -3, no such
>   sensor implemented, will be returned from the get-sensor-state RTAS call. For
>   DR connectors of logical DR entities, the DR connector must be allocated to
>   the OS to return this value, otherwise a sensor value of 2 or 3 will be
>   returned.
>
>   UNUSABLE (2):
>
>   Returned for logical DR entities when the DR entity is not currently
>   available to the OS, but may possibly be made available to the OS by calling
>   set-indicator with the allocation-state indicator, setting that indicator to
>   usable.
>
> So it seems 'PRESENT' is in fact the right value immediately after PCI
> hotplug, but it doesn't seem clear from the documentation whether 'PRESENT'
> or 'UNUSABLE' is more correct immediately after CPU hotplug. What does
> seem clear as that UNUSABLE does not have any use in the case of PCI
> devices: just PRESENT/EMPTY(0).
>
> But we actually 0-initialize the sensor field for DRCEntrys corresponding
> to PCI devices, which corresponds to 'EMPTY' (0), so the handling seems
> correct for PCI devices...

Thanks Michael for the above information on PRESENT and USABLE states.

>
> And since we already initialize PHB sensors to UNUSABLE in the top-level
> DRC list, I'm not sure why adjacent CPU entries would be affected by what
> we do later for PCI devices?

Sorry if I wasn't clear enough in my previous mail. CPU hotplug isn't
affected by what you do for PCI devices, but...

> It seems like you'd just need to do the
> equivalent of what we do for PHBs during realize:

when I try to do the same state changes for CPU hotplug, things don't
work as expected.

>
>   spapr_add_phb_to_drc_table(sphb->buid, 2 /* Unusable */);
>
> So I'm not sure where the need for guest kernel changes is coming from for
> CPU hotplug.

When the resource is hotplugged, you change the state from UNUSABLE to
PRESENT in QEMU before signalling the guest (via check exception irq).
But the same state change in CPU hotplug case isn't as per guest
kernel's expectation.

> Do you have a snippet of what the initialize/hot_add hooks
> like in your case?

I am talking about this piece of code in the the kernel in
arch/powerpc/platforms/pseries/dlpar.c

int dlpar_acquire_drc(u32 drc_index)
{
        int dr_status, rc;

        rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
                       DR_ENTITY_SENSE, drc_index);
        if (rc || dr_status != DR_ENTITY_UNUSABLE)
          return -1;
       ...
}

I have circumvented this problem by not setting the state to PRESENT
in my current hotplug patch. You can refer to the same in
spapr_cpu_hotplug_add() routine that's part of my patch 14/15
(https://lists.gnu.org/archive/html/qemu-devel/2014-09/msg00757.html)

Regards,
Bharata.
Michael Roth Sept. 4, 2014, 4:12 p.m. UTC | #6
Quoting Bharata B Rao (2014-09-04 10:08:20)
> On Thu, Sep 4, 2014 at 4:33 AM, Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> >> > +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
> >> > +{
> >> > +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> >> > +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> >> > +    sPAPRConfigureConnectorState *ccs;
> >> > +    int slot = PCI_SLOT(dev->devfn);
> >> > +    int offset, ret;
> >> > +    void *fdt_orig, *fdt;
> >> > +    char nodename[512];
> >> > +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
> >> > +                                        INDICATOR_ENTITY_SENSE_MASK,
> >> > +                                        INDICATOR_ENTITY_SENSE_SHIFT);
> >> > +
> >>
> >> I am building on this infrastructure of yours and adding CPU hotplug
> >> support to sPAPR guests.
> >>
> >> So we start with dr state of UNUSABLE and change it to PRESENT like
> >> above when performing hotplug operation. But after this, in case of
> >> CPU hotplug, the CPU hotplug code in the kernel
> >> (arch/powerpc/platforms/pseries/dlpar.c:dlpar_acquire_drc()) does
> >> get-sensor-state rtas call and expects the dr state to be UNUSABLE. Is
> >> the guest kernel right in expecting dr state to be in UNUSABLE state
> >> like this ? I have in fact disabled this check in the guest kernel to
> >> get a CPU hotplugged successfully, but wanted to understand the state
> >> changes and the expectations from the guest kernel correctly.
> >
> > According to PAPR+ 2.7 13.5.3.3,
> >
> >   PRESENT (1):
> >
> >   Returned for logical and physical DR entities when the DR connector is
> >   allocated to the OS and the DR entity is present. For physical DR entities,
> >   this indicates that the DR connector actually has a DR entity plugged into
> >   it. For DR connectors of physical DR entities, the DR connector must be
> >   allocated to the OS to return this value, otherwise a status of -3, no such
> >   sensor implemented, will be returned from the get-sensor-state RTAS call. For
> >   DR connectors of logical DR entities, the DR connector must be allocated to
> >   the OS to return this value, otherwise a sensor value of 2 or 3 will be
> >   returned.
> >
> >   UNUSABLE (2):
> >
> >   Returned for logical DR entities when the DR entity is not currently
> >   available to the OS, but may possibly be made available to the OS by calling
> >   set-indicator with the allocation-state indicator, setting that indicator to
> >   usable.
> >
> > So it seems 'PRESENT' is in fact the right value immediately after PCI
> > hotplug, but it doesn't seem clear from the documentation whether 'PRESENT'
> > or 'UNUSABLE' is more correct immediately after CPU hotplug. What does
> > seem clear as that UNUSABLE does not have any use in the case of PCI
> > devices: just PRESENT/EMPTY(0).
> >
> > But we actually 0-initialize the sensor field for DRCEntrys corresponding
> > to PCI devices, which corresponds to 'EMPTY' (0), so the handling seems
> > correct for PCI devices...
> 
> Thanks Michael for the above information on PRESENT and USABLE states.
> 
> >
> > And since we already initialize PHB sensors to UNUSABLE in the top-level
> > DRC list, I'm not sure why adjacent CPU entries would be affected by what
> > we do later for PCI devices?
> 
> Sorry if I wasn't clear enough in my previous mail. CPU hotplug isn't
> affected by what you do for PCI devices, but...
> 
> > It seems like you'd just need to do the
> > equivalent of what we do for PHBs during realize:
> 
> when I try to do the same state changes for CPU hotplug, things don't
> work as expected.
> 
> >
> >   spapr_add_phb_to_drc_table(sphb->buid, 2 /* Unusable */);
> >
> > So I'm not sure where the need for guest kernel changes is coming from for
> > CPU hotplug.
> 
> When the resource is hotplugged, you change the state from UNUSABLE to
> PRESENT in QEMU before signalling the guest (via check exception irq).
> But the same state change in CPU hotplug case isn't as per guest
> kernel's expectation.
> 
> > Do you have a snippet of what the initialize/hot_add hooks
> > like in your case?
> 
> I am talking about this piece of code in the the kernel in
> arch/powerpc/platforms/pseries/dlpar.c
> 
> int dlpar_acquire_drc(u32 drc_index)
> {
>         int dr_status, rc;
> 
>         rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
>                        DR_ENTITY_SENSE, drc_index);
>         if (rc || dr_status != DR_ENTITY_UNUSABLE)
>           return -1;
>        ...
> }
> 
> I have circumvented this problem by not setting the state to PRESENT
> in my current hotplug patch. You can refer to the same in
> spapr_cpu_hotplug_add() routine that's part of my patch 14/15
> (https://lists.gnu.org/archive/html/qemu-devel/2014-09/msg00757.html)

Yah, that's what I was getting at: at least just to get things working
for testing, just avoid the PRESENT bits in your hot_add_cpu hook rather
than patching the guest. Unfortunately the documentation isn't particularly
clear about which of these approaches is more correct as far as CPUs go. But
looking at it again:

   UNUSABLE (2):

   Returned for logical DR entities when the DR entity is not currently
   available to the OS, but may possibly be made available to the OS by calling
   set-indicator with the allocation-state indicator, setting that indicator to
   usable.

That 'usable' indicator setting is documented for set-indicator as (1), which
happens to correspond to PRESENT (1). So my read would be that for 'physical'
hotplug (like PCI), the firmware changes the indicator state to PRESENT/USABLE
immediately after hotplug, whereas with 'logical' hotplug (like PHB/CPU), the
guest OS signals this transition to USABLE through set-indicator calls for the
9003 sensor/allocation state after hotplug (which also seems to match up with
the kernel code).

This seems to correspond with the dlpar_acquire_drc() function, but I'm a
little confused why that's not also called in the PHB path...I think maybe
in that case it's handled by drmgr in userspace... will take another look
to confirm.

> 
> Regards,
> Bharata.
Michael Roth Sept. 4, 2014, 4:34 p.m. UTC | #7
Quoting Michael Roth (2014-09-04 11:12:15)
> Quoting Bharata B Rao (2014-09-04 10:08:20)
> > On Thu, Sep 4, 2014 at 4:33 AM, Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> > >> > +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
> > >> > +{
> > >> > +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
> > >> > +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
> > >> > +    sPAPRConfigureConnectorState *ccs;
> > >> > +    int slot = PCI_SLOT(dev->devfn);
> > >> > +    int offset, ret;
> > >> > +    void *fdt_orig, *fdt;
> > >> > +    char nodename[512];
> > >> > +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
> > >> > +                                        INDICATOR_ENTITY_SENSE_MASK,
> > >> > +                                        INDICATOR_ENTITY_SENSE_SHIFT);
> > >> > +
> > >>
> > >> I am building on this infrastructure of yours and adding CPU hotplug
> > >> support to sPAPR guests.
> > >>
> > >> So we start with dr state of UNUSABLE and change it to PRESENT like
> > >> above when performing hotplug operation. But after this, in case of
> > >> CPU hotplug, the CPU hotplug code in the kernel
> > >> (arch/powerpc/platforms/pseries/dlpar.c:dlpar_acquire_drc()) does
> > >> get-sensor-state rtas call and expects the dr state to be UNUSABLE. Is
> > >> the guest kernel right in expecting dr state to be in UNUSABLE state
> > >> like this ? I have in fact disabled this check in the guest kernel to
> > >> get a CPU hotplugged successfully, but wanted to understand the state
> > >> changes and the expectations from the guest kernel correctly.
> > >
> > > According to PAPR+ 2.7 13.5.3.3,
> > >
> > >   PRESENT (1):
> > >
> > >   Returned for logical and physical DR entities when the DR connector is
> > >   allocated to the OS and the DR entity is present. For physical DR entities,
> > >   this indicates that the DR connector actually has a DR entity plugged into
> > >   it. For DR connectors of physical DR entities, the DR connector must be
> > >   allocated to the OS to return this value, otherwise a status of -3, no such
> > >   sensor implemented, will be returned from the get-sensor-state RTAS call. For
> > >   DR connectors of logical DR entities, the DR connector must be allocated to
> > >   the OS to return this value, otherwise a sensor value of 2 or 3 will be
> > >   returned.
> > >
> > >   UNUSABLE (2):
> > >
> > >   Returned for logical DR entities when the DR entity is not currently
> > >   available to the OS, but may possibly be made available to the OS by calling
> > >   set-indicator with the allocation-state indicator, setting that indicator to
> > >   usable.
> > >
> > > So it seems 'PRESENT' is in fact the right value immediately after PCI
> > > hotplug, but it doesn't seem clear from the documentation whether 'PRESENT'
> > > or 'UNUSABLE' is more correct immediately after CPU hotplug. What does
> > > seem clear as that UNUSABLE does not have any use in the case of PCI
> > > devices: just PRESENT/EMPTY(0).
> > >
> > > But we actually 0-initialize the sensor field for DRCEntrys corresponding
> > > to PCI devices, which corresponds to 'EMPTY' (0), so the handling seems
> > > correct for PCI devices...
> > 
> > Thanks Michael for the above information on PRESENT and USABLE states.
> > 
> > >
> > > And since we already initialize PHB sensors to UNUSABLE in the top-level
> > > DRC list, I'm not sure why adjacent CPU entries would be affected by what
> > > we do later for PCI devices?
> > 
> > Sorry if I wasn't clear enough in my previous mail. CPU hotplug isn't
> > affected by what you do for PCI devices, but...
> > 
> > > It seems like you'd just need to do the
> > > equivalent of what we do for PHBs during realize:
> > 
> > when I try to do the same state changes for CPU hotplug, things don't
> > work as expected.
> > 
> > >
> > >   spapr_add_phb_to_drc_table(sphb->buid, 2 /* Unusable */);
> > >
> > > So I'm not sure where the need for guest kernel changes is coming from for
> > > CPU hotplug.
> > 
> > When the resource is hotplugged, you change the state from UNUSABLE to
> > PRESENT in QEMU before signalling the guest (via check exception irq).
> > But the same state change in CPU hotplug case isn't as per guest
> > kernel's expectation.
> > 
> > > Do you have a snippet of what the initialize/hot_add hooks
> > > like in your case?
> > 
> > I am talking about this piece of code in the the kernel in
> > arch/powerpc/platforms/pseries/dlpar.c
> > 
> > int dlpar_acquire_drc(u32 drc_index)
> > {
> >         int dr_status, rc;
> > 
> >         rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
> >                        DR_ENTITY_SENSE, drc_index);
> >         if (rc || dr_status != DR_ENTITY_UNUSABLE)
> >           return -1;
> >        ...
> > }
> > 
> > I have circumvented this problem by not setting the state to PRESENT
> > in my current hotplug patch. You can refer to the same in
> > spapr_cpu_hotplug_add() routine that's part of my patch 14/15
> > (https://lists.gnu.org/archive/html/qemu-devel/2014-09/msg00757.html)
> 
> Yah, that's what I was getting at: at least just to get things working
> for testing, just avoid the PRESENT bits in your hot_add_cpu hook rather
> than patching the guest. Unfortunately the documentation isn't particularly
> clear about which of these approaches is more correct as far as CPUs go. But
> looking at it again:
> 
>    UNUSABLE (2):
> 
>    Returned for logical DR entities when the DR entity is not currently
>    available to the OS, but may possibly be made available to the OS by calling
>    set-indicator with the allocation-state indicator, setting that indicator to
>    usable.
> 
> That 'usable' indicator setting is documented for set-indicator as (1), which
> happens to correspond to PRESENT (1). So my read would be that for 'physical'
> hotplug (like PCI), the firmware changes the indicator state to PRESENT/USABLE
> immediately after hotplug, whereas with 'logical' hotplug (like PHB/CPU), the
> guest OS signals this transition to USABLE through set-indicator calls for the
> 9003 sensor/allocation state after hotplug (which also seems to match up with
> the kernel code).
> 
> This seems to correspond with the dlpar_acquire_drc() function, but I'm a
> little confused why that's not also called in the PHB path...I think maybe
> in that case it's handled by drmgr in userspace... will take another look
> to confirm.

Yah, here's the code from drmgr, same expectations:

int     
acquire_drc(uint32_t drc_index)
{
    int rc;

    say(DEBUG, "Acquiring drc index 0x%x\n", drc_index);

    rc = dr_entity_sense(drc_index);
    if (rc != STATE_UNUSABLE) {
        say(ERROR, "Entity sense failed for drc %x with %d\n%s\n",
            drc_index, rc, entity_sense_error(rc));
        return -1;
    }

    say(DEBUG, "Setting allocation state to 'alloc usable'\n");
    rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
    if (rc) {
        say(ERROR, "Allocation failed for drc %x with %d\n%s\n",
            drc_index, rc, set_indicator_error(rc));
        return -1;
    }

    say(DEBUG, "Setting indicator state to 'unisolate'\n");
    rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
    if (rc) {
        int ret;
        rc = -1;

        say(ERROR, "Unisolate failed for drc %x with %d\n%s\n",
            drc_index, rc, set_indicator_error(rc));
        ret = rtas_set_indicator(ALLOCATION_STATE, drc_index,
                     ALLOC_UNUSABLE);
        if (ret) {
            say(ERROR, "Failed recovery to unusable state after "
                "unisolate failure for drc %x with %d\n%s\n",
                drc_index, ret, set_indicator_error(ret));
        }
    }

    return rc;
}

> 
> > 
> > Regards,
> > Bharata.
Nathan Fontenot Sept. 5, 2014, 3:10 a.m. UTC | #8
On 09/04/2014 11:34 AM, Michael Roth wrote:
> Quoting Michael Roth (2014-09-04 11:12:15)
>> Quoting Bharata B Rao (2014-09-04 10:08:20)
>>> On Thu, Sep 4, 2014 at 4:33 AM, Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
>>>>>> +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
>>>>>> +{
>>>>>> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
>>>>>> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
>>>>>> +    sPAPRConfigureConnectorState *ccs;
>>>>>> +    int slot = PCI_SLOT(dev->devfn);
>>>>>> +    int offset, ret;
>>>>>> +    void *fdt_orig, *fdt;
>>>>>> +    char nodename[512];
>>>>>> +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
>>>>>> +                                        INDICATOR_ENTITY_SENSE_MASK,
>>>>>> +                                        INDICATOR_ENTITY_SENSE_SHIFT);
>>>>>> +
>>>>>
>>>>> I am building on this infrastructure of yours and adding CPU hotplug
>>>>> support to sPAPR guests.
>>>>>
>>>>> So we start with dr state of UNUSABLE and change it to PRESENT like
>>>>> above when performing hotplug operation. But after this, in case of
>>>>> CPU hotplug, the CPU hotplug code in the kernel
>>>>> (arch/powerpc/platforms/pseries/dlpar.c:dlpar_acquire_drc()) does
>>>>> get-sensor-state rtas call and expects the dr state to be UNUSABLE. Is
>>>>> the guest kernel right in expecting dr state to be in UNUSABLE state
>>>>> like this ? I have in fact disabled this check in the guest kernel to
>>>>> get a CPU hotplugged successfully, but wanted to understand the state
>>>>> changes and the expectations from the guest kernel correctly.
>>>>
>>>> According to PAPR+ 2.7 13.5.3.3,
>>>>
>>>>   PRESENT (1):
>>>>
>>>>   Returned for logical and physical DR entities when the DR connector is
>>>>   allocated to the OS and the DR entity is present. For physical DR entities,
>>>>   this indicates that the DR connector actually has a DR entity plugged into
>>>>   it. For DR connectors of physical DR entities, the DR connector must be
>>>>   allocated to the OS to return this value, otherwise a status of -3, no such
>>>>   sensor implemented, will be returned from the get-sensor-state RTAS call. For
>>>>   DR connectors of logical DR entities, the DR connector must be allocated to
>>>>   the OS to return this value, otherwise a sensor value of 2 or 3 will be
>>>>   returned.
>>>>
>>>>   UNUSABLE (2):
>>>>
>>>>   Returned for logical DR entities when the DR entity is not currently
>>>>   available to the OS, but may possibly be made available to the OS by calling
>>>>   set-indicator with the allocation-state indicator, setting that indicator to
>>>>   usable.
>>>>
>>>> So it seems 'PRESENT' is in fact the right value immediately after PCI
>>>> hotplug, but it doesn't seem clear from the documentation whether 'PRESENT'
>>>> or 'UNUSABLE' is more correct immediately after CPU hotplug. What does
>>>> seem clear as that UNUSABLE does not have any use in the case of PCI
>>>> devices: just PRESENT/EMPTY(0).
>>>>
>>>> But we actually 0-initialize the sensor field for DRCEntrys corresponding
>>>> to PCI devices, which corresponds to 'EMPTY' (0), so the handling seems
>>>> correct for PCI devices...
>>>
>>> Thanks Michael for the above information on PRESENT and USABLE states.
>>>
>>>>
>>>> And since we already initialize PHB sensors to UNUSABLE in the top-level
>>>> DRC list, I'm not sure why adjacent CPU entries would be affected by what
>>>> we do later for PCI devices?
>>>
>>> Sorry if I wasn't clear enough in my previous mail. CPU hotplug isn't
>>> affected by what you do for PCI devices, but...
>>>
>>>> It seems like you'd just need to do the
>>>> equivalent of what we do for PHBs during realize:
>>>
>>> when I try to do the same state changes for CPU hotplug, things don't
>>> work as expected.
>>>
>>>>
>>>>   spapr_add_phb_to_drc_table(sphb->buid, 2 /* Unusable */);
>>>>
>>>> So I'm not sure where the need for guest kernel changes is coming from for
>>>> CPU hotplug.
>>>
>>> When the resource is hotplugged, you change the state from UNUSABLE to
>>> PRESENT in QEMU before signalling the guest (via check exception irq).
>>> But the same state change in CPU hotplug case isn't as per guest
>>> kernel's expectation.
>>>
>>>> Do you have a snippet of what the initialize/hot_add hooks
>>>> like in your case?
>>>
>>> I am talking about this piece of code in the the kernel in
>>> arch/powerpc/platforms/pseries/dlpar.c
>>>
>>> int dlpar_acquire_drc(u32 drc_index)
>>> {
>>>         int dr_status, rc;
>>>
>>>         rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
>>>                        DR_ENTITY_SENSE, drc_index);
>>>         if (rc || dr_status != DR_ENTITY_UNUSABLE)
>>>           return -1;
>>>        ...
>>> }
>>>
>>> I have circumvented this problem by not setting the state to PRESENT
>>> in my current hotplug patch. You can refer to the same in
>>> spapr_cpu_hotplug_add() routine that's part of my patch 14/15
>>> (https://lists.gnu.org/archive/html/qemu-devel/2014-09/msg00757.html)
>>
>> Yah, that's what I was getting at: at least just to get things working
>> for testing, just avoid the PRESENT bits in your hot_add_cpu hook rather
>> than patching the guest. Unfortunately the documentation isn't particularly
>> clear about which of these approaches is more correct as far as CPUs go. But
>> looking at it again:
>>
>>    UNUSABLE (2):
>>
>>    Returned for logical DR entities when the DR entity is not currently
>>    available to the OS, but may possibly be made available to the OS by calling
>>    set-indicator with the allocation-state indicator, setting that indicator to
>>    usable.
>>
>> That 'usable' indicator setting is documented for set-indicator as (1), which
>> happens to correspond to PRESENT (1). So my read would be that for 'physical'
>> hotplug (like PCI), the firmware changes the indicator state to PRESENT/USABLE
>> immediately after hotplug, whereas with 'logical' hotplug (like PHB/CPU), the
>> guest OS signals this transition to USABLE through set-indicator calls for the
>> 9003 sensor/allocation state after hotplug (which also seems to match up with
>> the kernel code).
>>
>> This seems to correspond with the dlpar_acquire_drc() function, but I'm a
>> little confused why that's not also called in the PHB path...I think maybe
>> in that case it's handled by drmgr in userspace... will take another look
>> to confirm.
> 
> Yah, here's the code from drmgr, same expectations:

Yes, the guest expects the state to be UNUSABLE.

As mentioned above, I don't think we should be changing the state to PRESENT
before notifying the guest.

-Nathan

 
> 
> int     
> acquire_drc(uint32_t drc_index)
> {
>     int rc;
> 
>     say(DEBUG, "Acquiring drc index 0x%x\n", drc_index);
> 
>     rc = dr_entity_sense(drc_index);
>     if (rc != STATE_UNUSABLE) {
>         say(ERROR, "Entity sense failed for drc %x with %d\n%s\n",
>             drc_index, rc, entity_sense_error(rc));
>         return -1;
>     }
> 
>     say(DEBUG, "Setting allocation state to 'alloc usable'\n");
>     rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
>     if (rc) {
>         say(ERROR, "Allocation failed for drc %x with %d\n%s\n",
>             drc_index, rc, set_indicator_error(rc));
>         return -1;
>     }
> 
>     say(DEBUG, "Setting indicator state to 'unisolate'\n");
>     rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
>     if (rc) {
>         int ret;
>         rc = -1;
> 
>         say(ERROR, "Unisolate failed for drc %x with %d\n%s\n",
>             drc_index, rc, set_indicator_error(rc));
>         ret = rtas_set_indicator(ALLOCATION_STATE, drc_index,
>                      ALLOC_UNUSABLE);
>         if (ret) {
>             say(ERROR, "Failed recovery to unusable state after "
>                 "unisolate failure for drc %x with %d\n%s\n",
>                 drc_index, ret, set_indicator_error(ret));
>         }
>     }
> 
>     return rc;
> }
> 
>>
>>>
>>> Regards,
>>> Bharata.
Tyrel Datwyler Sept. 5, 2014, 5:17 p.m. UTC | #9
On 09/04/2014 08:10 PM, Nathan Fontenot wrote:
> On 09/04/2014 11:34 AM, Michael Roth wrote:
>> Quoting Michael Roth (2014-09-04 11:12:15)
>>> Quoting Bharata B Rao (2014-09-04 10:08:20)
>>>> On Thu, Sep 4, 2014 at 4:33 AM, Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
>>>>>>> +static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
>>>>>>> +{
>>>>>>> +    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
>>>>>>> +    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
>>>>>>> +    sPAPRConfigureConnectorState *ccs;
>>>>>>> +    int slot = PCI_SLOT(dev->devfn);
>>>>>>> +    int offset, ret;
>>>>>>> +    void *fdt_orig, *fdt;
>>>>>>> +    char nodename[512];
>>>>>>> +    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
>>>>>>> +                                        INDICATOR_ENTITY_SENSE_MASK,
>>>>>>> +                                        INDICATOR_ENTITY_SENSE_SHIFT);
>>>>>>> +
>>>>>>
>>>>>> I am building on this infrastructure of yours and adding CPU hotplug
>>>>>> support to sPAPR guests.
>>>>>>
>>>>>> So we start with dr state of UNUSABLE and change it to PRESENT like
>>>>>> above when performing hotplug operation. But after this, in case of
>>>>>> CPU hotplug, the CPU hotplug code in the kernel
>>>>>> (arch/powerpc/platforms/pseries/dlpar.c:dlpar_acquire_drc()) does
>>>>>> get-sensor-state rtas call and expects the dr state to be UNUSABLE. Is
>>>>>> the guest kernel right in expecting dr state to be in UNUSABLE state
>>>>>> like this ? I have in fact disabled this check in the guest kernel to
>>>>>> get a CPU hotplugged successfully, but wanted to understand the state
>>>>>> changes and the expectations from the guest kernel correctly.
>>>>>
>>>>> According to PAPR+ 2.7 13.5.3.3,
>>>>>
>>>>>   PRESENT (1):
>>>>>
>>>>>   Returned for logical and physical DR entities when the DR connector is
>>>>>   allocated to the OS and the DR entity is present. For physical DR entities,
>>>>>   this indicates that the DR connector actually has a DR entity plugged into
>>>>>   it. For DR connectors of physical DR entities, the DR connector must be
>>>>>   allocated to the OS to return this value, otherwise a status of -3, no such
>>>>>   sensor implemented, will be returned from the get-sensor-state RTAS call. For
>>>>>   DR connectors of logical DR entities, the DR connector must be allocated to
>>>>>   the OS to return this value, otherwise a sensor value of 2 or 3 will be
>>>>>   returned.
>>>>>
>>>>>   UNUSABLE (2):
>>>>>
>>>>>   Returned for logical DR entities when the DR entity is not currently
>>>>>   available to the OS, but may possibly be made available to the OS by calling
>>>>>   set-indicator with the allocation-state indicator, setting that indicator to
>>>>>   usable.
>>>>>
>>>>> So it seems 'PRESENT' is in fact the right value immediately after PCI
>>>>> hotplug, but it doesn't seem clear from the documentation whether 'PRESENT'
>>>>> or 'UNUSABLE' is more correct immediately after CPU hotplug. What does
>>>>> seem clear as that UNUSABLE does not have any use in the case of PCI
>>>>> devices: just PRESENT/EMPTY(0).
>>>>>
>>>>> But we actually 0-initialize the sensor field for DRCEntrys corresponding
>>>>> to PCI devices, which corresponds to 'EMPTY' (0), so the handling seems
>>>>> correct for PCI devices...
>>>>
>>>> Thanks Michael for the above information on PRESENT and USABLE states.
>>>>
>>>>>
>>>>> And since we already initialize PHB sensors to UNUSABLE in the top-level
>>>>> DRC list, I'm not sure why adjacent CPU entries would be affected by what
>>>>> we do later for PCI devices?
>>>>
>>>> Sorry if I wasn't clear enough in my previous mail. CPU hotplug isn't
>>>> affected by what you do for PCI devices, but...
>>>>
>>>>> It seems like you'd just need to do the
>>>>> equivalent of what we do for PHBs during realize:
>>>>
>>>> when I try to do the same state changes for CPU hotplug, things don't
>>>> work as expected.
>>>>
>>>>>
>>>>>   spapr_add_phb_to_drc_table(sphb->buid, 2 /* Unusable */);
>>>>>
>>>>> So I'm not sure where the need for guest kernel changes is coming from for
>>>>> CPU hotplug.
>>>>
>>>> When the resource is hotplugged, you change the state from UNUSABLE to
>>>> PRESENT in QEMU before signalling the guest (via check exception irq).
>>>> But the same state change in CPU hotplug case isn't as per guest
>>>> kernel's expectation.
>>>>
>>>>> Do you have a snippet of what the initialize/hot_add hooks
>>>>> like in your case?
>>>>
>>>> I am talking about this piece of code in the the kernel in
>>>> arch/powerpc/platforms/pseries/dlpar.c
>>>>
>>>> int dlpar_acquire_drc(u32 drc_index)
>>>> {
>>>>         int dr_status, rc;
>>>>
>>>>         rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
>>>>                        DR_ENTITY_SENSE, drc_index);
>>>>         if (rc || dr_status != DR_ENTITY_UNUSABLE)
>>>>           return -1;
>>>>        ...
>>>> }
>>>>
>>>> I have circumvented this problem by not setting the state to PRESENT
>>>> in my current hotplug patch. You can refer to the same in
>>>> spapr_cpu_hotplug_add() routine that's part of my patch 14/15
>>>> (https://lists.gnu.org/archive/html/qemu-devel/2014-09/msg00757.html)
>>>
>>> Yah, that's what I was getting at: at least just to get things working
>>> for testing, just avoid the PRESENT bits in your hot_add_cpu hook rather
>>> than patching the guest. Unfortunately the documentation isn't particularly
>>> clear about which of these approaches is more correct as far as CPUs go. But
>>> looking at it again:
>>>
>>>    UNUSABLE (2):
>>>
>>>    Returned for logical DR entities when the DR entity is not currently
>>>    available to the OS, but may possibly be made available to the OS by calling
>>>    set-indicator with the allocation-state indicator, setting that indicator to
>>>    usable.
>>>
>>> That 'usable' indicator setting is documented for set-indicator as (1), which
>>> happens to correspond to PRESENT (1). So my read would be that for 'physical'
>>> hotplug (like PCI), the firmware changes the indicator state to PRESENT/USABLE
>>> immediately after hotplug, whereas with 'logical' hotplug (like PHB/CPU), the
>>> guest OS signals this transition to USABLE through set-indicator calls for the
>>> 9003 sensor/allocation state after hotplug (which also seems to match up with
>>> the kernel code).
>>>
>>> This seems to correspond with the dlpar_acquire_drc() function, but I'm a
>>> little confused why that's not also called in the PHB path...I think maybe
>>> in that case it's handled by drmgr in userspace... will take another look
>>> to confirm.
>>
>> Yah, here's the code from drmgr, same expectations:
> 
> Yes, the guest expects the state to be UNUSABLE.
> 
> As mentioned above, I don't think we should be changing the state to PRESENT
> before notifying the guest.
> 
> -Nathan
> 
>  

There is a subtle difference in the state transitions for logical and
physical entities. In the pci case which is a physical entity the state
needs to start as PRESENT to indicate a device is present in the slot.
Physical entities are always considered resources that are owned by the
OS. For cpus which are considered logical entities the state starts in
UNUSABLE waiting for a set-indicator call to change the allocation to
usable. Logical entities are owned by the platform and reserved until
the they are made owned by the OS through an allocation attempt by the
set-indicator call from the guest. If the changing of the allocation
state to usable is successful the entities sense state should now be
PRESENT.

The state transition diagram in PAPR section 13.4 should make this a
little clearer.

-Tyrel

>>
>> int     
>> acquire_drc(uint32_t drc_index)
>> {
>>     int rc;
>>
>>     say(DEBUG, "Acquiring drc index 0x%x\n", drc_index);
>>
>>     rc = dr_entity_sense(drc_index);
>>     if (rc != STATE_UNUSABLE) {
>>         say(ERROR, "Entity sense failed for drc %x with %d\n%s\n",
>>             drc_index, rc, entity_sense_error(rc));
>>         return -1;
>>     }
>>
>>     say(DEBUG, "Setting allocation state to 'alloc usable'\n");
>>     rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
>>     if (rc) {
>>         say(ERROR, "Allocation failed for drc %x with %d\n%s\n",
>>             drc_index, rc, set_indicator_error(rc));
>>         return -1;
>>     }
>>
>>     say(DEBUG, "Setting indicator state to 'unisolate'\n");
>>     rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
>>     if (rc) {
>>         int ret;
>>         rc = -1;
>>
>>         say(ERROR, "Unisolate failed for drc %x with %d\n%s\n",
>>             drc_index, rc, set_indicator_error(rc));
>>         ret = rtas_set_indicator(ALLOCATION_STATE, drc_index,
>>                      ALLOC_UNUSABLE);
>>         if (ret) {
>>             say(ERROR, "Failed recovery to unusable state after "
>>                 "unisolate failure for drc %x with %d\n%s\n",
>>                 drc_index, ret, set_indicator_error(ret));
>>         }
>>     }
>>
>>     return rc;
>> }
>>
>>>
>>>>
>>>> Regards,
>>>> Bharata.
> 
>
diff mbox

Patch

diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 96a57be..23864ab 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -87,6 +87,17 @@ 
 #define ENCODE_DRC_STATE(val, m, s) \
     (((uint32_t)(val) << (s)) & (uint32_t)(m))
 
+#define FDT_MAX_SIZE            0x10000
+#define _FDT(exp) \
+    do { \
+        int ret = (exp);                                           \
+        if (ret < 0) {                                             \
+            return ret;                                            \
+        }                                                          \
+    } while (0)
+
+static void spapr_drc_state_reset(sPAPRDrcEntry *drc_entry);
+
 static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
 {
     sPAPRPHBState *sphb;
@@ -476,6 +487,22 @@  static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr,
         /* encode the new value into the correct bit field */
         shift = INDICATOR_ISOLATION_SHIFT;
         mask = INDICATOR_ISOLATION_MASK;
+        if (drc_entry) {
+            /* transition from unisolated to isolated for a hotplug slot
+             * entails completion of guest-side device unplug/cleanup, so
+             * we can now safely remove the device if qemu is waiting for
+             * it to be released
+             */
+            if (DECODE_DRC_STATE(*pind, mask, shift) != indicator_state) {
+                if (indicator_state == 0 && drc_entry->awaiting_release) {
+                    /* device_del has been called and host is waiting
+                     * for guest to release/isolate device, go ahead
+                     * and remove it now
+                     */
+                    spapr_drc_state_reset(drc_entry);
+                }
+            }
+        }
         break;
     case 9002: /* DR */
         shift = INDICATOR_DR_SHIFT;
@@ -816,6 +843,198 @@  static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
     return &phb->iommu_as;
 }
 
+static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
+                                       int phb_index)
+{
+    int slot = PCI_SLOT(dev->devfn);
+    char slotname[16];
+    bool is_bridge = 1;
+    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
+    uint32_t reg[RESOURCE_CELLS_TOTAL * 8] = { 0 };
+    uint32_t assigned[RESOURCE_CELLS_TOTAL * 8] = { 0 };
+    int reg_size, assigned_size;
+
+    drc_entry = spapr_phb_to_drc_entry(phb_index + SPAPR_PCI_BASE_BUID);
+    g_assert(drc_entry);
+    drc_entry_slot = &drc_entry->child_entries[slot];
+
+    if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) ==
+        PCI_HEADER_TYPE_NORMAL) {
+        is_bridge = 0;
+    }
+
+    _FDT(fdt_setprop_cell(fdt, offset, "vendor-id",
+                          pci_default_read_config(dev, PCI_VENDOR_ID, 2)));
+    _FDT(fdt_setprop_cell(fdt, offset, "device-id",
+                          pci_default_read_config(dev, PCI_DEVICE_ID, 2)));
+    _FDT(fdt_setprop_cell(fdt, offset, "revision-id",
+                          pci_default_read_config(dev, PCI_REVISION_ID, 1)));
+    _FDT(fdt_setprop_cell(fdt, offset, "class-code",
+                          pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) << 8));
+
+    _FDT(fdt_setprop_cell(fdt, offset, "interrupts",
+                          pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)));
+
+    /* if this device is NOT a bridge */
+    if (!is_bridge) {
+        _FDT(fdt_setprop_cell(fdt, offset, "min-grant",
+            pci_default_read_config(dev, PCI_MIN_GNT, 1)));
+        _FDT(fdt_setprop_cell(fdt, offset, "max-latency",
+            pci_default_read_config(dev, PCI_MAX_LAT, 1)));
+        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-id",
+            pci_default_read_config(dev, PCI_SUBSYSTEM_ID, 2)));
+        _FDT(fdt_setprop_cell(fdt, offset, "subsystem-vendor-id",
+            pci_default_read_config(dev, PCI_SUBSYSTEM_VENDOR_ID, 2)));
+    }
+
+    _FDT(fdt_setprop_cell(fdt, offset, "cache-line-size",
+        pci_default_read_config(dev, PCI_CACHE_LINE_SIZE, 1)));
+
+    /* the following fdt cells are masked off the pci status register */
+    int pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
+    _FDT(fdt_setprop_cell(fdt, offset, "devsel-speed",
+                          PCI_STATUS_DEVSEL_MASK & pci_status));
+    _FDT(fdt_setprop_cell(fdt, offset, "fast-back-to-back",
+                          PCI_STATUS_FAST_BACK & pci_status));
+    _FDT(fdt_setprop_cell(fdt, offset, "66mhz-capable",
+                          PCI_STATUS_66MHZ & pci_status));
+    _FDT(fdt_setprop_cell(fdt, offset, "udf-supported",
+                          PCI_STATUS_UDF & pci_status));
+
+    _FDT(fdt_setprop_string(fdt, offset, "name", "pci"));
+    sprintf(slotname, "Slot %d", slot + phb_index * 32);
+    _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", slotname, strlen(slotname)));
+    _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index",
+                          drc_entry_slot->drc_index));
+
+    _FDT(fdt_setprop_cell(fdt, offset, "#address-cells",
+                          RESOURCE_CELLS_ADDRESS));
+    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
+                          RESOURCE_CELLS_SIZE));
+    _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x",
+                          RESOURCE_CELLS_SIZE));
+    fill_resource_props(dev, phb_index, reg, &reg_size,
+                        assigned, &assigned_size);
+    _FDT(fdt_setprop(fdt, offset, "reg", reg, reg_size));
+    _FDT(fdt_setprop(fdt, offset, "assigned-addresses",
+                     assigned, assigned_size));
+
+    return 0;
+}
+
+static int spapr_device_hotplug_add(DeviceState *qdev, PCIDevice *dev)
+{
+    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
+    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
+    sPAPRConfigureConnectorState *ccs;
+    int slot = PCI_SLOT(dev->devfn);
+    int offset, ret;
+    void *fdt_orig, *fdt;
+    char nodename[512];
+    uint32_t encoded = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_PRESENT,
+                                        INDICATOR_ENTITY_SENSE_MASK,
+                                        INDICATOR_ENTITY_SENSE_SHIFT);
+
+    drc_entry = spapr_phb_to_drc_entry(phb->buid);
+    g_assert(drc_entry);
+    drc_entry_slot = &drc_entry->child_entries[slot];
+
+    drc_entry->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
+    drc_entry->state |= encoded; /* DR entity present */
+    drc_entry_slot->state &= ~(uint32_t)INDICATOR_ENTITY_SENSE_MASK;
+    drc_entry_slot->state |= encoded; /* and the slot */
+
+    /* add OF node for pci device and required OF DT properties */
+    fdt_orig = g_malloc0(FDT_MAX_SIZE);
+    offset = fdt_create(fdt_orig, FDT_MAX_SIZE);
+    fdt_begin_node(fdt_orig, "");
+    fdt_end_node(fdt_orig);
+    fdt_finish(fdt_orig);
+
+    fdt = g_malloc0(FDT_MAX_SIZE);
+    fdt_open_into(fdt_orig, fdt, FDT_MAX_SIZE);
+    sprintf(nodename, "pci@%d", slot);
+    offset = fdt_add_subnode(fdt, 0, nodename);
+    ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index);
+    g_assert(!ret);
+    g_free(fdt_orig);
+
+    /* hold on to node, configure_connector will pass it to the guest later */
+    ccs = &drc_entry_slot->cc_state;
+    ccs->fdt = fdt;
+    ccs->offset_start = offset;
+    ccs->state = CC_STATE_PENDING;
+    ccs->dev = dev;
+
+    return 0;
+}
+
+/* check whether guest has released/isolated device */
+static bool spapr_drc_state_is_releasable(sPAPRDrcEntry *drc_entry)
+{
+    return !DECODE_DRC_STATE(drc_entry->state,
+                             INDICATOR_ISOLATION_MASK,
+                             INDICATOR_ISOLATION_SHIFT);
+}
+
+/* finalize device unplug/deletion */
+static void spapr_drc_state_reset(sPAPRDrcEntry *drc_entry)
+{
+    sPAPRConfigureConnectorState *ccs = &drc_entry->cc_state;
+    uint32_t sense_empty = ENCODE_DRC_STATE(INDICATOR_ENTITY_SENSE_EMPTY,
+                                            INDICATOR_ENTITY_SENSE_MASK,
+                                            INDICATOR_ENTITY_SENSE_SHIFT);
+
+    g_free(ccs->fdt);
+    ccs->fdt = NULL;
+    object_unparent(OBJECT(ccs->dev));
+    ccs->dev = NULL;
+    ccs->state = CC_STATE_IDLE;
+    drc_entry->state &= ~INDICATOR_ENTITY_SENSE_MASK;
+    drc_entry->state |= sense_empty;
+    drc_entry->awaiting_release = false;
+}
+
+static void spapr_device_hotplug_remove(DeviceState *qdev, PCIDevice *dev)
+{
+    sPAPRPHBState *phb = SPAPR_PCI_HOST_BRIDGE(qdev);
+    sPAPRDrcEntry *drc_entry, *drc_entry_slot;
+    sPAPRConfigureConnectorState *ccs;
+    int slot = PCI_SLOT(dev->devfn);
+
+    drc_entry = spapr_phb_to_drc_entry(phb->buid);
+    g_assert(drc_entry);
+    drc_entry_slot = &drc_entry->child_entries[slot];
+    ccs = &drc_entry_slot->cc_state;
+    /* shouldn't be removing devices we haven't created an fdt for */
+    g_assert(ccs->state != CC_STATE_IDLE);
+    /* if the device has already been released/isolated by guest, go ahead
+     * and remove it now. Otherwise, flag it as pending guest release so it
+     * can be removed later
+     */
+    if (spapr_drc_state_is_releasable(drc_entry_slot)) {
+        spapr_drc_state_reset(drc_entry_slot);
+    } else {
+        if (drc_entry_slot->awaiting_release) {
+            fprintf(stderr, "waiting for guest to release the device");
+        } else {
+            drc_entry_slot->awaiting_release = true;
+        }
+    }
+}
+
+static void spapr_phb_hot_plug(HotplugHandler *plug_handler,
+                               DeviceState *plugged_dev, Error **errp)
+{
+    spapr_device_hotplug_add(DEVICE(plug_handler), PCI_DEVICE(plugged_dev));
+}
+
+static void spapr_phb_hot_unplug(HotplugHandler *plug_handler,
+                                 DeviceState *plugged_dev, Error **errp)
+{
+    spapr_device_hotplug_remove(DEVICE(plug_handler), PCI_DEVICE(plugged_dev));
+}
+
 static void spapr_phb_realize(DeviceState *dev, Error **errp)
 {
     SysBusDevice *s = SYS_BUS_DEVICE(dev);
@@ -903,6 +1122,7 @@  static void spapr_phb_realize(DeviceState *dev, Error **errp)
                            &sphb->memspace, &sphb->iospace,
                            PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS);
     phb->bus = bus;
+    qbus_set_hotplug_handler(BUS(phb->bus), DEVICE(sphb), NULL);
 
     /*
      * Initialize PHB address space.
@@ -1108,6 +1328,7 @@  static void spapr_phb_class_init(ObjectClass *klass, void *data)
     PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
     DeviceClass *dc = DEVICE_CLASS(klass);
     sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
+    HotplugHandlerClass *hp = HOTPLUG_HANDLER_CLASS(klass);
 
     hc->root_bus_path = spapr_phb_root_bus_path;
     dc->realize = spapr_phb_realize;
@@ -1117,6 +1338,8 @@  static void spapr_phb_class_init(ObjectClass *klass, void *data)
     set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
     dc->cannot_instantiate_with_device_add_yet = false;
     spc->finish_realize = spapr_phb_finish_realize;
+    hp->plug = spapr_phb_hot_plug;
+    hp->unplug = spapr_phb_hot_unplug;
 }
 
 static const TypeInfo spapr_phb_info = {
@@ -1125,6 +1348,10 @@  static const TypeInfo spapr_phb_info = {
     .instance_size = sizeof(sPAPRPHBState),
     .class_init    = spapr_phb_class_init,
     .class_size    = sizeof(sPAPRPHBClass),
+    .interfaces    = (InterfaceInfo[]) {
+        { TYPE_HOTPLUG_HANDLER },
+        { }
+    }
 };
 
 PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index)
@@ -1304,14 +1531,6 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
         return bus_off;
     }
 
-#define _FDT(exp) \
-    do { \
-        int ret = (exp);                                           \
-        if (ret < 0) {                                             \
-            return ret;                                            \
-        }                                                          \
-    } while (0)
-
     /* Write PHB properties */
     _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
     _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index fac85f8..e08dd2f 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -36,6 +36,7 @@  struct sPAPRDrcEntry {
     void *fdt;
     int fdt_offset;
     uint32_t state;
+    bool awaiting_release;
     sPAPRConfigureConnectorState cc_state;
     sPAPRDrcEntry *child_entries;
 };