diff mbox

[RFC,v0,4/5] spapr: Support hotplug by specifying DRC count

Message ID 1438580143-587-5-git-send-email-bharata@linux.vnet.ibm.com
State New
Headers show

Commit Message

Bharata B Rao Aug. 3, 2015, 5:35 a.m. UTC
Support hotplug identifier type RTAS_LOG_V6_HP_ID_DRC_COUNT that allows
hotplugging of DRCs by specifying the DRC count.

While we are here, rename

spapr_hotplug_req_add_event() to spapr_hotplug_req_add_by_index()
spapr_hotplug_req_remove_event() to spapr_hotplug_req_remove_by_index()

so that they match with spapr_hotplug_req_add_by_count().

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
 hw/ppc/spapr.c         |  2 +-
 hw/ppc/spapr_events.c  | 47 ++++++++++++++++++++++++++++++++++++++---------
 hw/ppc/spapr_pci.c     |  4 ++--
 include/hw/ppc/spapr.h |  8 ++++++--
 4 files changed, 47 insertions(+), 14 deletions(-)

Comments

David Gibson Aug. 3, 2015, 6:55 a.m. UTC | #1
On Mon, Aug 03, 2015 at 11:05:42AM +0530, Bharata B Rao wrote:
> Support hotplug identifier type RTAS_LOG_V6_HP_ID_DRC_COUNT that allows
> hotplugging of DRCs by specifying the DRC count.
> 
> While we are here, rename
> 
> spapr_hotplug_req_add_event() to spapr_hotplug_req_add_by_index()
> spapr_hotplug_req_remove_event() to spapr_hotplug_req_remove_by_index()
> 
> so that they match with spapr_hotplug_req_add_by_count().
> 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c         |  2 +-
>  hw/ppc/spapr_events.c  | 47 ++++++++++++++++++++++++++++++++++++++---------
>  hw/ppc/spapr_pci.c     |  4 ++--
>  include/hw/ppc/spapr.h |  8 ++++++--
>  4 files changed, 47 insertions(+), 14 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 669dc43..13af9be 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2072,7 +2072,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
>  
>          drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
>          drck->attach(drc, dev, fdt, fdt_offset, !dev->hotplugged, errp);
> -        spapr_hotplug_req_add_event(drc);
> +        spapr_hotplug_req_add_by_index(drc);
>          addr += SPAPR_MEMORY_BLOCK_SIZE;
>      }
>  }
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 98bf7ae..744ea62 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -386,7 +386,9 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
>      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
>  }
>  
> -static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> +static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> +                                    sPAPRDRConnectorType drc_type,
> +                                    uint32_t drc)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>      struct hp_log_full *new_hp;
> @@ -395,8 +397,6 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
>      struct rtas_event_log_v6_maina *maina;
>      struct rtas_event_log_v6_mainb *mainb;
>      struct rtas_event_log_v6_hp *hp;
> -    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> -    sPAPRDRConnectorType drc_type = drck->get_type(drc);
>  
>      new_hp = g_malloc0(sizeof(struct hp_log_full));
>      hdr = &new_hp->hdr;
> @@ -427,8 +427,7 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
>      hp->hdr.section_length = cpu_to_be16(sizeof(*hp));
>      hp->hdr.section_version = 1; /* includes extended modifier */
>      hp->hotplug_action = hp_action;
> -    hp->drc.index = cpu_to_be32(drck->get_index(drc));
> -    hp->hotplug_identifier = RTAS_LOG_V6_HP_ID_DRC_INDEX;
> +    hp->hotplug_identifier = hp_id;
>  
>      switch (drc_type) {
>      case SPAPR_DR_CONNECTOR_TYPE_PCI:
> @@ -445,19 +444,49 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
>          return;
>      }
>  
> +    if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT) {
> +        hp->drc.count = cpu_to_be32(drc);

I'm a bit confused as to how this message can work with *only* a
count and not some sort of base index.

> +    } else if (hp_id == RTAS_LOG_V6_HP_ID_DRC_INDEX) {
> +        hp->drc.index = cpu_to_be32(drc);
> +    }
> +
>      rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
>  
>      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
>  }
>  
> -void spapr_hotplug_req_add_event(sPAPRDRConnector *drc)
> +void spapr_hotplug_req_add_by_index(sPAPRDRConnector *drc)
> +{
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    sPAPRDRConnectorType drc_type = drck->get_type(drc);
> +    uint32 index = drck->get_index(drc);
> +
> +    spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_INDEX,
> +                            RTAS_LOG_V6_HP_ACTION_ADD, drc_type, index);
> +}
> +
> +void spapr_hotplug_req_remove_by_index(sPAPRDRConnector *drc)
> +{
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    sPAPRDRConnectorType drc_type = drck->get_type(drc);
> +    uint32 index = drck->get_index(drc);
> +
> +    spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_INDEX,
> +                            RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, index);
> +}
> +
> +void spapr_hotplug_req_add_by_count(sPAPRDRConnectorType drc_type,
> +                                       uint32_t count)
>  {
> -    spapr_hotplug_req_event(drc, RTAS_LOG_V6_HP_ACTION_ADD);
> +    spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_COUNT,
> +                            RTAS_LOG_V6_HP_ACTION_ADD, drc_type, count);
>  }
>  
> -void spapr_hotplug_req_remove_event(sPAPRDRConnector *drc)
> +void spapr_hotplug_req_remove_by_count(sPAPRDRConnectorType drc_type,
> +                                          uint32_t count)
>  {
> -    spapr_hotplug_req_event(drc, RTAS_LOG_V6_HP_ACTION_REMOVE);
> +    spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_COUNT,
> +                            RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, count);
>  }
>  
>  static void check_exception(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index cfd3b7b..9d41060 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -1179,7 +1179,7 @@ static void spapr_phb_hot_plug_child(HotplugHandler *plug_handler,
>          return;
>      }
>      if (plugged_dev->hotplugged) {
> -        spapr_hotplug_req_add_event(drc);
> +        spapr_hotplug_req_add_by_index(drc);
>      }
>  }
>  
> @@ -1207,7 +1207,7 @@ static void spapr_phb_hot_unplug_child(HotplugHandler *plug_handler,
>              error_propagate(errp, local_err);
>              return;
>          }
> -        spapr_hotplug_req_remove_event(drc);
> +        spapr_hotplug_req_remove_by_index(drc);
>      }
>  }
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index b6cb0d0..d87c6d4 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -588,8 +588,12 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>                        sPAPRTCETable *tcet);
>  void spapr_pci_switch_vga(bool big_endian);
> -void spapr_hotplug_req_add_event(sPAPRDRConnector *drc);
> -void spapr_hotplug_req_remove_event(sPAPRDRConnector *drc);
> +void spapr_hotplug_req_add_by_index(sPAPRDRConnector *drc);
> +void spapr_hotplug_req_remove_by_index(sPAPRDRConnector *drc);
> +void spapr_hotplug_req_add_by_count(sPAPRDRConnectorType drc_type,
> +                                       uint32_t count);
> +void spapr_hotplug_req_remove_by_count(sPAPRDRConnectorType drc_type,
> +                                          uint32_t count);
>  
>  /* rtas-configure-connector state */
>  struct sPAPRConfigureConnectorState {
Bharata B Rao Aug. 3, 2015, 7:53 a.m. UTC | #2
On Mon, Aug 03, 2015 at 04:55:01PM +1000, David Gibson wrote:
> On Mon, Aug 03, 2015 at 11:05:42AM +0530, Bharata B Rao wrote:
> > Support hotplug identifier type RTAS_LOG_V6_HP_ID_DRC_COUNT that allows
> > hotplugging of DRCs by specifying the DRC count.
> > 
> > While we are here, rename
> > 
> > spapr_hotplug_req_add_event() to spapr_hotplug_req_add_by_index()
> > spapr_hotplug_req_remove_event() to spapr_hotplug_req_remove_by_index()
> > 
> > so that they match with spapr_hotplug_req_add_by_count().
> > 
> > Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> > ---
> >  hw/ppc/spapr.c         |  2 +-
> >  hw/ppc/spapr_events.c  | 47 ++++++++++++++++++++++++++++++++++++++---------
> >  hw/ppc/spapr_pci.c     |  4 ++--
> >  include/hw/ppc/spapr.h |  8 ++++++--
> >  4 files changed, 47 insertions(+), 14 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 669dc43..13af9be 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -2072,7 +2072,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
> >  
> >          drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> >          drck->attach(drc, dev, fdt, fdt_offset, !dev->hotplugged, errp);
> > -        spapr_hotplug_req_add_event(drc);
> > +        spapr_hotplug_req_add_by_index(drc);
> >          addr += SPAPR_MEMORY_BLOCK_SIZE;
> >      }
> >  }
> > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > index 98bf7ae..744ea62 100644
> > --- a/hw/ppc/spapr_events.c
> > +++ b/hw/ppc/spapr_events.c
> > @@ -386,7 +386,9 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
> >      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
> >  }
> >  
> > -static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > +static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> > +                                    sPAPRDRConnectorType drc_type,
> > +                                    uint32_t drc)
> >  {
> >      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >      struct hp_log_full *new_hp;
> > @@ -395,8 +397,6 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> >      struct rtas_event_log_v6_maina *maina;
> >      struct rtas_event_log_v6_mainb *mainb;
> >      struct rtas_event_log_v6_hp *hp;
> > -    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > -    sPAPRDRConnectorType drc_type = drck->get_type(drc);
> >  
> >      new_hp = g_malloc0(sizeof(struct hp_log_full));
> >      hdr = &new_hp->hdr;
> > @@ -427,8 +427,7 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> >      hp->hdr.section_length = cpu_to_be16(sizeof(*hp));
> >      hp->hdr.section_version = 1; /* includes extended modifier */
> >      hp->hotplug_action = hp_action;
> > -    hp->drc.index = cpu_to_be32(drck->get_index(drc));
> > -    hp->hotplug_identifier = RTAS_LOG_V6_HP_ID_DRC_INDEX;
> > +    hp->hotplug_identifier = hp_id;
> >  
> >      switch (drc_type) {
> >      case SPAPR_DR_CONNECTOR_TYPE_PCI:
> > @@ -445,19 +444,49 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> >          return;
> >      }
> >  
> > +    if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT) {
> > +        hp->drc.count = cpu_to_be32(drc);
> 
> I'm a bit confused as to how this message can work with *only* a
> count and not some sort of base index.

Right and this can be an issue when we start supporting memory removal
and the currrent userspace drmgr tool will just go ahead and remove
'count' number of LMBs that have been marked for removal and there is
really no association b/n these LMBs to the LMBs that make up the
pc-dimm device that is being removed.

One solution is to extend the rtas event header and include the base
drc-index (with count type of identifier)  so that we know exactly which
set of LMBs to remove for the given pc-dimm device.

Michael has some thoughts about alternative ways in thich we can achieve
removal of correct LMBs without needing to pass the base drc-index.

Regards,
Bharata.
Michael Roth Aug. 3, 2015, 10:32 p.m. UTC | #3
Quoting Bharata B Rao (2015-08-03 02:53:02)
> On Mon, Aug 03, 2015 at 04:55:01PM +1000, David Gibson wrote:
> > On Mon, Aug 03, 2015 at 11:05:42AM +0530, Bharata B Rao wrote:
> > > Support hotplug identifier type RTAS_LOG_V6_HP_ID_DRC_COUNT that allows
> > > hotplugging of DRCs by specifying the DRC count.
> > > 
> > > While we are here, rename
> > > 
> > > spapr_hotplug_req_add_event() to spapr_hotplug_req_add_by_index()
> > > spapr_hotplug_req_remove_event() to spapr_hotplug_req_remove_by_index()
> > > 
> > > so that they match with spapr_hotplug_req_add_by_count().
> > > 
> > > Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> > > ---
> > >  hw/ppc/spapr.c         |  2 +-
> > >  hw/ppc/spapr_events.c  | 47 ++++++++++++++++++++++++++++++++++++++---------
> > >  hw/ppc/spapr_pci.c     |  4 ++--
> > >  include/hw/ppc/spapr.h |  8 ++++++--
> > >  4 files changed, 47 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > > index 669dc43..13af9be 100644
> > > --- a/hw/ppc/spapr.c
> > > +++ b/hw/ppc/spapr.c
> > > @@ -2072,7 +2072,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
> > >  
> > >          drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > >          drck->attach(drc, dev, fdt, fdt_offset, !dev->hotplugged, errp);
> > > -        spapr_hotplug_req_add_event(drc);
> > > +        spapr_hotplug_req_add_by_index(drc);
> > >          addr += SPAPR_MEMORY_BLOCK_SIZE;
> > >      }
> > >  }
> > > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > > index 98bf7ae..744ea62 100644
> > > --- a/hw/ppc/spapr_events.c
> > > +++ b/hw/ppc/spapr_events.c
> > > @@ -386,7 +386,9 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
> > >      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
> > >  }
> > >  
> > > -static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > > +static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> > > +                                    sPAPRDRConnectorType drc_type,
> > > +                                    uint32_t drc)
> > >  {
> > >      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > >      struct hp_log_full *new_hp;
> > > @@ -395,8 +397,6 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > >      struct rtas_event_log_v6_maina *maina;
> > >      struct rtas_event_log_v6_mainb *mainb;
> > >      struct rtas_event_log_v6_hp *hp;
> > > -    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > > -    sPAPRDRConnectorType drc_type = drck->get_type(drc);
> > >  
> > >      new_hp = g_malloc0(sizeof(struct hp_log_full));
> > >      hdr = &new_hp->hdr;
> > > @@ -427,8 +427,7 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > >      hp->hdr.section_length = cpu_to_be16(sizeof(*hp));
> > >      hp->hdr.section_version = 1; /* includes extended modifier */
> > >      hp->hotplug_action = hp_action;
> > > -    hp->drc.index = cpu_to_be32(drck->get_index(drc));
> > > -    hp->hotplug_identifier = RTAS_LOG_V6_HP_ID_DRC_INDEX;
> > > +    hp->hotplug_identifier = hp_id;
> > >  
> > >      switch (drc_type) {
> > >      case SPAPR_DR_CONNECTOR_TYPE_PCI:
> > > @@ -445,19 +444,49 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > >          return;
> > >      }
> > >  
> > > +    if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT) {
> > > +        hp->drc.count = cpu_to_be32(drc);
> > 
> > I'm a bit confused as to how this message can work with *only* a
> > count and not some sort of base index.

Talked a bit with Nathan for some more insight on *why* it works:

drc-count is actually how hotplug add/remove is done for pHyp cpu/mem
resources, drc-index is used in special situations like memory/cpu
failures, and for other drc types like pci.

So in general, drc-count with no drc-index is supposed to work fine
with existing guests/tools.

As far as the *how*:

guests just attempt to add LMBs until they fulfill the request, if
there's a failure they abort/rewind and try the next LMB.

There are some complications with QEMU though. Namely, our LMBs are
hotplugged in the form of DIMM-backed LMBs, as opposed to pHyp where
we hotplug from a logical pool of LMBs where there's no DIMM-level
modeling of resources. So, with both pHyp and pKVM, LMB hotplug/unplug
can fail for various reasons, but in the case of QEMU we have the
unique no-dimm-backing-this-lmb failure. We fail this at the DRC
level: if a guest attempts to set DRC/LMB allocation state to USED,
and QEMU decides there's no DIMM backing this memory, it simply reports
an error. This is in line with PAPR+. And due to the
try/abort/rewind/next handling this all still works.

Memory *unplug* is a little hairier:

All the above applies, but prior to getting to the point of QEMU failing
the DRC/LMB removal (via set-isolation-state:ISOLATED,
set-allocation-state:UNUSED returning failure), the guest kernel will
offline the LMB, migrating the memory if need be, so when the guest hits
the DRC failure it has to bring the LMB back online. This unecessary
churn and memory migration is obviously not desirable, but we've yet
to look into the performance aspects of it.

I think a drc-index being provided as a hint/starting point would be
the right way to address this potential performance issue, but I'm hoping
the approach in this series is sufficient for mem hotplug/unplug
for existing guests that wouldn't support such a hint.

This drc-index hint would likely be introduced in tandem with a new
ibm,client-set-architecture flag so QEMU can gracefully switch over for
newer guests. Nathan is working the PAPR+ spec for this in the process
of his work to move memory hotplug handling into guest kernel.

> 
> Right and this can be an issue when we start supporting memory removal
> and the currrent userspace drmgr tool will just go ahead and remove
> 'count' number of LMBs that have been marked for removal and there is
> really no association b/n these LMBs to the LMBs that make up the
> pc-dimm device that is being removed.
> 
> One solution is to extend the rtas event header and include the base
> drc-index (with count type of identifier)  so that we know exactly which
> set of LMBs to remove for the given pc-dimm device.
> 
> Michael has some thoughts about alternative ways in thich we can achieve
> removal of correct LMBs without needing to pass the base drc-index.
> 
> Regards,
> Bharata.
>
David Gibson Aug. 4, 2015, 4:36 a.m. UTC | #4
On Mon, Aug 03, 2015 at 05:32:43PM -0500, Michael Roth wrote:
> Quoting Bharata B Rao (2015-08-03 02:53:02)
> > On Mon, Aug 03, 2015 at 04:55:01PM +1000, David Gibson wrote:
> > > On Mon, Aug 03, 2015 at 11:05:42AM +0530, Bharata B Rao wrote:
> > > > Support hotplug identifier type RTAS_LOG_V6_HP_ID_DRC_COUNT that allows
> > > > hotplugging of DRCs by specifying the DRC count.
> > > > 
> > > > While we are here, rename
> > > > 
> > > > spapr_hotplug_req_add_event() to spapr_hotplug_req_add_by_index()
> > > > spapr_hotplug_req_remove_event() to spapr_hotplug_req_remove_by_index()
> > > > 
> > > > so that they match with spapr_hotplug_req_add_by_count().
> > > > 
> > > > Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> > > > ---
> > > >  hw/ppc/spapr.c         |  2 +-
> > > >  hw/ppc/spapr_events.c  | 47 ++++++++++++++++++++++++++++++++++++++---------
> > > >  hw/ppc/spapr_pci.c     |  4 ++--
> > > >  include/hw/ppc/spapr.h |  8 ++++++--
> > > >  4 files changed, 47 insertions(+), 14 deletions(-)
> > > > 
> > > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > > > index 669dc43..13af9be 100644
> > > > --- a/hw/ppc/spapr.c
> > > > +++ b/hw/ppc/spapr.c
> > > > @@ -2072,7 +2072,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
> > > >  
> > > >          drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > > >          drck->attach(drc, dev, fdt, fdt_offset, !dev->hotplugged, errp);
> > > > -        spapr_hotplug_req_add_event(drc);
> > > > +        spapr_hotplug_req_add_by_index(drc);
> > > >          addr += SPAPR_MEMORY_BLOCK_SIZE;
> > > >      }
> > > >  }
> > > > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > > > index 98bf7ae..744ea62 100644
> > > > --- a/hw/ppc/spapr_events.c
> > > > +++ b/hw/ppc/spapr_events.c
> > > > @@ -386,7 +386,9 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
> > > >      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
> > > >  }
> > > >  
> > > > -static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > > > +static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> > > > +                                    sPAPRDRConnectorType drc_type,
> > > > +                                    uint32_t drc)
> > > >  {
> > > >      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > > >      struct hp_log_full *new_hp;
> > > > @@ -395,8 +397,6 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > > >      struct rtas_event_log_v6_maina *maina;
> > > >      struct rtas_event_log_v6_mainb *mainb;
> > > >      struct rtas_event_log_v6_hp *hp;
> > > > -    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > > > -    sPAPRDRConnectorType drc_type = drck->get_type(drc);
> > > >  
> > > >      new_hp = g_malloc0(sizeof(struct hp_log_full));
> > > >      hdr = &new_hp->hdr;
> > > > @@ -427,8 +427,7 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > > >      hp->hdr.section_length = cpu_to_be16(sizeof(*hp));
> > > >      hp->hdr.section_version = 1; /* includes extended modifier */
> > > >      hp->hotplug_action = hp_action;
> > > > -    hp->drc.index = cpu_to_be32(drck->get_index(drc));
> > > > -    hp->hotplug_identifier = RTAS_LOG_V6_HP_ID_DRC_INDEX;
> > > > +    hp->hotplug_identifier = hp_id;
> > > >  
> > > >      switch (drc_type) {
> > > >      case SPAPR_DR_CONNECTOR_TYPE_PCI:
> > > > @@ -445,19 +444,49 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
> > > >          return;
> > > >      }
> > > >  
> > > > +    if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT) {
> > > > +        hp->drc.count = cpu_to_be32(drc);
> > > 
> > > I'm a bit confused as to how this message can work with *only* a
> > > count and not some sort of base index.
> 
> Talked a bit with Nathan for some more insight on *why* it works:
> 
> drc-count is actually how hotplug add/remove is done for pHyp cpu/mem
> resources, drc-index is used in special situations like memory/cpu
> failures, and for other drc types like pci.
> 
> So in general, drc-count with no drc-index is supposed to work fine
> with existing guests/tools.
> 
> As far as the *how*:
> 
> guests just attempt to add LMBs until they fulfill the request, if
> there's a failure they abort/rewind and try the next LMB.

Huh, ok.

> There are some complications with QEMU though. Namely, our LMBs are
> hotplugged in the form of DIMM-backed LMBs, as opposed to pHyp where
> we hotplug from a logical pool of LMBs where there's no DIMM-level
> modeling of resources. So, with both pHyp and pKVM, LMB hotplug/unplug
> can fail for various reasons, but in the case of QEMU we have the
> unique no-dimm-backing-this-lmb failure. We fail this at the DRC
> level: if a guest attempts to set DRC/LMB allocation state to USED,
> and QEMU decides there's no DIMM backing this memory, it simply reports
> an error. This is in line with PAPR+. And due to the
> try/abort/rewind/next handling this all still works.
> 
> Memory *unplug* is a little hairier:
> 
> All the above applies, but prior to getting to the point of QEMU failing
> the DRC/LMB removal (via set-isolation-state:ISOLATED,
> set-allocation-state:UNUSED returning failure), the guest kernel will
> offline the LMB, migrating the memory if need be, so when the guest hits
> the DRC failure it has to bring the LMB back online. This unecessary
> churn and memory migration is obviously not desirable, but we've yet
> to look into the performance aspects of it.

Ah, and presumably this migrate shuffle could continue for a long
time, as it works its way though each LMB until reaching the ones
connected to the removed DIMM.

> I think a drc-index being provided as a hint/starting point would be
> the right way to address this potential performance issue, but I'm hoping
> the approach in this series is sufficient for mem hotplug/unplug
> for existing guests that wouldn't support such a hint.

I guess so.  Although it does make me wonder if using the PC derived
DIMM model was really what we wanted for PAPR guests.  I guess
implementing the phyp LMB-pool model probably would have required a
lot of messing with the core qemu hotplug model, though.

> This drc-index hint would likely be introduced in tandem with a new
> ibm,client-set-architecture flag so QEMU can gracefully switch over for
> newer guests. Nathan is working the PAPR+ spec for this in the process
> of his work to move memory hotplug handling into guest kernel.
> 
> > 
> > Right and this can be an issue when we start supporting memory removal
> > and the currrent userspace drmgr tool will just go ahead and remove
> > 'count' number of LMBs that have been marked for removal and there is
> > really no association b/n these LMBs to the LMBs that make up the
> > pc-dimm device that is being removed.
> > 
> > One solution is to extend the rtas event header and include the base
> > drc-index (with count type of identifier)  so that we know exactly which
> > set of LMBs to remove for the given pc-dimm device.
> > 
> > Michael has some thoughts about alternative ways in thich we can achieve
> > removal of correct LMBs without needing to pass the base drc-index.
> > 
> > Regards,
> > Bharata.
> > 
>
diff mbox

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 669dc43..13af9be 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2072,7 +2072,7 @@  static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
 
         drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
         drck->attach(drc, dev, fdt, fdt_offset, !dev->hotplugged, errp);
-        spapr_hotplug_req_add_event(drc);
+        spapr_hotplug_req_add_by_index(drc);
         addr += SPAPR_MEMORY_BLOCK_SIZE;
     }
 }
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 98bf7ae..744ea62 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -386,7 +386,9 @@  static void spapr_powerdown_req(Notifier *n, void *opaque)
     qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
 }
 
-static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
+static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
+                                    sPAPRDRConnectorType drc_type,
+                                    uint32_t drc)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
     struct hp_log_full *new_hp;
@@ -395,8 +397,6 @@  static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
     struct rtas_event_log_v6_maina *maina;
     struct rtas_event_log_v6_mainb *mainb;
     struct rtas_event_log_v6_hp *hp;
-    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
-    sPAPRDRConnectorType drc_type = drck->get_type(drc);
 
     new_hp = g_malloc0(sizeof(struct hp_log_full));
     hdr = &new_hp->hdr;
@@ -427,8 +427,7 @@  static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
     hp->hdr.section_length = cpu_to_be16(sizeof(*hp));
     hp->hdr.section_version = 1; /* includes extended modifier */
     hp->hotplug_action = hp_action;
-    hp->drc.index = cpu_to_be32(drck->get_index(drc));
-    hp->hotplug_identifier = RTAS_LOG_V6_HP_ID_DRC_INDEX;
+    hp->hotplug_identifier = hp_id;
 
     switch (drc_type) {
     case SPAPR_DR_CONNECTOR_TYPE_PCI:
@@ -445,19 +444,49 @@  static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
         return;
     }
 
+    if (hp_id == RTAS_LOG_V6_HP_ID_DRC_COUNT) {
+        hp->drc.count = cpu_to_be32(drc);
+    } else if (hp_id == RTAS_LOG_V6_HP_ID_DRC_INDEX) {
+        hp->drc.index = cpu_to_be32(drc);
+    }
+
     rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
 
     qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
 }
 
-void spapr_hotplug_req_add_event(sPAPRDRConnector *drc)
+void spapr_hotplug_req_add_by_index(sPAPRDRConnector *drc)
+{
+    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+    sPAPRDRConnectorType drc_type = drck->get_type(drc);
+    uint32 index = drck->get_index(drc);
+
+    spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_INDEX,
+                            RTAS_LOG_V6_HP_ACTION_ADD, drc_type, index);
+}
+
+void spapr_hotplug_req_remove_by_index(sPAPRDRConnector *drc)
+{
+    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+    sPAPRDRConnectorType drc_type = drck->get_type(drc);
+    uint32 index = drck->get_index(drc);
+
+    spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_INDEX,
+                            RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, index);
+}
+
+void spapr_hotplug_req_add_by_count(sPAPRDRConnectorType drc_type,
+                                       uint32_t count)
 {
-    spapr_hotplug_req_event(drc, RTAS_LOG_V6_HP_ACTION_ADD);
+    spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_COUNT,
+                            RTAS_LOG_V6_HP_ACTION_ADD, drc_type, count);
 }
 
-void spapr_hotplug_req_remove_event(sPAPRDRConnector *drc)
+void spapr_hotplug_req_remove_by_count(sPAPRDRConnectorType drc_type,
+                                          uint32_t count)
 {
-    spapr_hotplug_req_event(drc, RTAS_LOG_V6_HP_ACTION_REMOVE);
+    spapr_hotplug_req_event(RTAS_LOG_V6_HP_ID_DRC_COUNT,
+                            RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, count);
 }
 
 static void check_exception(PowerPCCPU *cpu, sPAPRMachineState *spapr,
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index cfd3b7b..9d41060 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1179,7 +1179,7 @@  static void spapr_phb_hot_plug_child(HotplugHandler *plug_handler,
         return;
     }
     if (plugged_dev->hotplugged) {
-        spapr_hotplug_req_add_event(drc);
+        spapr_hotplug_req_add_by_index(drc);
     }
 }
 
@@ -1207,7 +1207,7 @@  static void spapr_phb_hot_unplug_child(HotplugHandler *plug_handler,
             error_propagate(errp, local_err);
             return;
         }
-        spapr_hotplug_req_remove_event(drc);
+        spapr_hotplug_req_remove_by_index(drc);
     }
 }
 
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index b6cb0d0..d87c6d4 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -588,8 +588,12 @@  int spapr_dma_dt(void *fdt, int node_off, const char *propname,
 int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
                       sPAPRTCETable *tcet);
 void spapr_pci_switch_vga(bool big_endian);
-void spapr_hotplug_req_add_event(sPAPRDRConnector *drc);
-void spapr_hotplug_req_remove_event(sPAPRDRConnector *drc);
+void spapr_hotplug_req_add_by_index(sPAPRDRConnector *drc);
+void spapr_hotplug_req_remove_by_index(sPAPRDRConnector *drc);
+void spapr_hotplug_req_add_by_count(sPAPRDRConnectorType drc_type,
+                                       uint32_t count);
+void spapr_hotplug_req_remove_by_count(sPAPRDRConnectorType drc_type,
+                                          uint32_t count);
 
 /* rtas-configure-connector state */
 struct sPAPRConfigureConnectorState {