diff mbox

[RFC,v1] spapr: Memory hot-unplug support

Message ID 1445853185-22518-1-git-send-email-bharata@linux.vnet.ibm.com
State New
Headers show

Commit Message

Bharata B Rao Oct. 26, 2015, 9:53 a.m. UTC
Add support to hot remove pc-dimm memory devices.

TODO: In response to memory hot removal operation on a DIMM device,
guest kernel might refuse to offline a few LMBs that are part of that device.
In such cases, we will have a DIMM device that has some LMBs online and some
LMBs offline. To avoid this situation, drmgr could be enhanced to support
a command line option that results in removal of all the LMBs or none.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
Changes in v1:
- Got rid of the patch that introduced a field in PCDIMMDevice to track
  DIMM marked for removal since we can track that using within DRC
  object.
- Removed the patch that added return value to rtas_set_indicator()
  since the required changes are already pushed by Michael Roth.

v0:

 hw/ppc/spapr.c     | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/ppc/spapr_drc.c | 18 +++++++++++
 2 files changed, 107 insertions(+), 1 deletion(-)

Comments

Nathan Fontenot Oct. 27, 2015, 7:52 p.m. UTC | #1
On 10/26/2015 04:53 AM, Bharata B Rao wrote:
> Add support to hot remove pc-dimm memory devices.
> 
> TODO: In response to memory hot removal operation on a DIMM device,
> guest kernel might refuse to offline a few LMBs that are part of that device.
> In such cases, we will have a DIMM device that has some LMBs online and some
> LMBs offline. To avoid this situation, drmgr could be enhanced to support
> a command line option that results in removal of all the LMBs or none.
>

We had discussed an update to drmgr previously that would allow one to specify
that they wanted to remove a certain number of LMBs starting at a specified drc
index. I would assume we could incorporate the notion that it is an all or
nothing request into that.

Unfortunately right now drmgr will attempt to remove all of the requested LMBs
but it does a best effort. This results in situations where some LMB's are not
removed and some are.

-Nathan 
 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> ---
> Changes in v1:
> - Got rid of the patch that introduced a field in PCDIMMDevice to track
>   DIMM marked for removal since we can track that using within DRC
>   object.
> - Removed the patch that added return value to rtas_set_indicator()
>   since the required changes are already pushed by Michael Roth.
> 
> v0:
> 
>  hw/ppc/spapr.c     | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/ppc/spapr_drc.c | 18 +++++++++++
>  2 files changed, 107 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index e1202ce..f5b1ac2 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2174,6 +2174,85 @@ out:
>      error_propagate(errp, local_err);
>  }
> 
> +typedef struct sPAPRDIMMState {
> +    uint32_t nr_lmbs;
> +} sPAPRDIMMState;
> +
> +static void spapr_lmb_release(DeviceState *dev, void *opaque)
> +{
> +    sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
> +    HotplugHandler *hotplug_ctrl = NULL;
> +    Error *local_err = NULL;
> +
> +    if (--ds->nr_lmbs) {
> +        return;
> +    }
> +
> +    g_free(ds);
> +
> +    /*
> +     * Now that all the LMBs have been removed by the guest, call the
> +     * pc-dimm unplug handler to cleanup up the pc-dimm device.
> +     */
> +    hotplug_ctrl = qdev_get_hotplug_handler(dev);
> +    hotplug_handler_unplug(hotplug_ctrl, dev, &local_err);
> +}
> +
> +static void spapr_del_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
> +                           Error **errp)
> +{
> +    sPAPRDRConnector *drc;
> +    sPAPRDRConnectorClass *drck;
> +    uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
> +    Error *local_err = NULL;
> +    int i;
> +    sPAPRDIMMState *ds = g_malloc0(sizeof(sPAPRDIMMState));
> +
> +    ds->nr_lmbs = nr_lmbs;
> +    for (i = 0; i < nr_lmbs; i++) {
> +        drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
> +                addr/SPAPR_MEMORY_BLOCK_SIZE);
> +        g_assert(drc);
> +
> +        drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +        drck->detach(drc, dev, spapr_lmb_release, ds, &local_err);
> +        addr += SPAPR_MEMORY_BLOCK_SIZE;
> +    }
> +    spapr_hotplug_req_remove_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
> +}
> +
> +static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
> +                                Error **errp)
> +{
> +    sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
> +    PCDIMMDevice *dimm = PC_DIMM(dev);
> +    PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
> +    MemoryRegion *mr = ddc->get_memory_region(dimm);
> +
> +    pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
> +    object_unparent(OBJECT(dev));
> +}
> +
> +static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
> +                                        DeviceState *dev, Error **errp)
> +{
> +    Error *local_err = NULL;
> +    PCDIMMDevice *dimm = PC_DIMM(dev);
> +    PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
> +    MemoryRegion *mr = ddc->get_memory_region(dimm);
> +    uint64_t size = memory_region_size(mr);
> +    uint64_t addr;
> +
> +    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, &local_err);
> +    if (local_err) {
> +        goto out;
> +    }
> +
> +    spapr_del_lmbs(dev, addr, size, &local_err);
> +out:
> +    error_propagate(errp, local_err);
> +}
> +
>  static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
>                                        DeviceState *dev, Error **errp)
>  {
> @@ -2221,7 +2300,15 @@ static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
>                                        DeviceState *dev, Error **errp)
>  {
>      if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
> -        error_setg(errp, "Memory hot unplug not supported by sPAPR");
> +        spapr_memory_unplug(hotplug_dev, dev, errp);
> +    }
> +}
> +
> +static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
> +                                                DeviceState *dev, Error **errp)
> +{
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
> +        spapr_memory_unplug_request(hotplug_dev, dev, errp);
>      }
>  }
> 
> @@ -2263,6 +2350,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>      hc->plug = spapr_machine_device_plug;
>      hc->unplug = spapr_machine_device_unplug;
>      mc->cpu_index_to_socket_id = spapr_cpu_index_to_socket_id;
> +    hc->unplug_request = spapr_machine_device_unplug_request;
> 
>      smc->dr_lmb_enabled = false;
>      fwc->get_dev_path = spapr_get_fw_dev_path;
> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> index 5d6ea7c..59b6ea9 100644
> --- a/hw/ppc/spapr_drc.c
> +++ b/hw/ppc/spapr_drc.c
> @@ -11,6 +11,7 @@
>   */
> 
>  #include "hw/ppc/spapr_drc.h"
> +#include "hw/ppc/spapr.h"
>  #include "qom/object.h"
>  #include "hw/qdev.h"
>  #include "qapi/visitor.h"
> @@ -77,6 +78,23 @@ static uint32_t set_isolation_state(sPAPRDRConnector *drc,
>          }
>      }
> 
> +    /*
> +     * Fail any request to ISOLATE the LMB DRC if this LMB doesn't
> +     * belong to a DIMM device that is marked for removal.
> +     *
> +     * Currently the guest userspace tool drmgr that drives the memory
> +     * hotplug/unplug will just try to remove a set of 'removable' LMBs
> +     * in response to a hot unplug request that is based on drc-count.
> +     * If the LMB being removed doesn't belong to a DIMM device that is
> +     * actually being unplugged, fail the isolation request here.
> +     */
> +    if (drc->type == SPAPR_DR_CONNECTOR_TYPE_LMB) {
> +        if ((state == SPAPR_DR_ISOLATION_STATE_ISOLATED) &&
> +             !drc->awaiting_release) {
> +            return RTAS_OUT_HW_ERROR;
> +        }
> +    }
> +
>      drc->isolation_state = state;
> 
>      if (drc->isolation_state == SPAPR_DR_ISOLATION_STATE_ISOLATED) {
>
Bharata B Rao Oct. 28, 2015, 9:17 a.m. UTC | #2
On Tue, Oct 27, 2015 at 02:52:13PM -0500, Nathan Fontenot wrote:
> On 10/26/2015 04:53 AM, Bharata B Rao wrote:
> > Add support to hot remove pc-dimm memory devices.
> > 
> > TODO: In response to memory hot removal operation on a DIMM device,
> > guest kernel might refuse to offline a few LMBs that are part of that device.
> > In such cases, we will have a DIMM device that has some LMBs online and some
> > LMBs offline. To avoid this situation, drmgr could be enhanced to support
> > a command line option that results in removal of all the LMBs or none.
> >
> 
> We had discussed an update to drmgr previously that would allow one to specify
> that they wanted to remove a certain number of LMBs starting at a specified drc
> index. I would assume we could incorporate the notion that it is an all or
> nothing request into that.

Right, but since that requires PAPR spec changes, I thought for now we
could add such an option to drmgr and use that from rtas_errd on PowerKVM
to correctly do memory hot unplug. Not feasible ?

Regards,
Bharata.
David Gibson Nov. 11, 2015, 1:36 a.m. UTC | #3
On Mon, Oct 26, 2015 at 03:23:05PM +0530, Bharata B Rao wrote:
> Add support to hot remove pc-dimm memory devices.

Sorry it's taken me so long to look at this.

> TODO: In response to memory hot removal operation on a DIMM device,
> guest kernel might refuse to offline a few LMBs that are part of that device.
> In such cases, we will have a DIMM device that has some LMBs online and some
> LMBs offline. To avoid this situation, drmgr could be enhanced to support
> a command line option that results in removal of all the LMBs or none.

Hm.. what would be the end result of such a situation?  We want to
handle it as gracefully as we can, even if the guest has old tools.
Is there some way we can detect this failure condition, and re-connect
the DIMM?

It does highlight the fact that the PAPR hotplug interface and the
pc-dimm model don't work together terribly well.  I think we have to
try to support it for the sake of management layers, but I do wonder
if we ought to thinkg about an alternative "lmb-pool" backend, where
the precise location of memory blocks isn't so important.  With some
thought such a backend might also be useful for paravirt x86.

Which also makes me think, I wonder if it would be possible to wire up
a PAPR compatible interface to qemu's balloon backend, since in some
ways the PAPR memory hotplug model acts more like a balloon (in that
the guest physical address of removed LMBs isn't usually important to
the host).

Still, we need to get the dimm backed model working first, I guess.

Apart from those overall considerations, the patch looks good.

> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> ---
> Changes in v1:
> - Got rid of the patch that introduced a field in PCDIMMDevice to track
>   DIMM marked for removal since we can track that using within DRC
>   object.
> - Removed the patch that added return value to rtas_set_indicator()
>   since the required changes are already pushed by Michael Roth.
> 
> v0:
> 
>  hw/ppc/spapr.c     | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/ppc/spapr_drc.c | 18 +++++++++++
>  2 files changed, 107 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index e1202ce..f5b1ac2 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2174,6 +2174,85 @@ out:
>      error_propagate(errp, local_err);
>  }
>  
> +typedef struct sPAPRDIMMState {
> +    uint32_t nr_lmbs;
> +} sPAPRDIMMState;
> +
> +static void spapr_lmb_release(DeviceState *dev, void *opaque)
> +{
> +    sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
> +    HotplugHandler *hotplug_ctrl = NULL;
> +    Error *local_err = NULL;
> +
> +    if (--ds->nr_lmbs) {
> +        return;
> +    }
> +
> +    g_free(ds);
> +
> +    /*
> +     * Now that all the LMBs have been removed by the guest, call the
> +     * pc-dimm unplug handler to cleanup up the pc-dimm device.
> +     */
> +    hotplug_ctrl = qdev_get_hotplug_handler(dev);
> +    hotplug_handler_unplug(hotplug_ctrl, dev, &local_err);
> +}
> +
> +static void spapr_del_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
> +                           Error **errp)
> +{
> +    sPAPRDRConnector *drc;
> +    sPAPRDRConnectorClass *drck;
> +    uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
> +    Error *local_err = NULL;
> +    int i;
> +    sPAPRDIMMState *ds = g_malloc0(sizeof(sPAPRDIMMState));
> +
> +    ds->nr_lmbs = nr_lmbs;
> +    for (i = 0; i < nr_lmbs; i++) {
> +        drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
> +                addr/SPAPR_MEMORY_BLOCK_SIZE);
> +        g_assert(drc);
> +
> +        drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +        drck->detach(drc, dev, spapr_lmb_release, ds, &local_err);
> +        addr += SPAPR_MEMORY_BLOCK_SIZE;
> +    }
> +    spapr_hotplug_req_remove_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
> +}
> +
> +static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
> +                                Error **errp)
> +{
> +    sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
> +    PCDIMMDevice *dimm = PC_DIMM(dev);
> +    PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
> +    MemoryRegion *mr = ddc->get_memory_region(dimm);
> +
> +    pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
> +    object_unparent(OBJECT(dev));
> +}
> +
> +static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
> +                                        DeviceState *dev, Error **errp)
> +{
> +    Error *local_err = NULL;
> +    PCDIMMDevice *dimm = PC_DIMM(dev);
> +    PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
> +    MemoryRegion *mr = ddc->get_memory_region(dimm);
> +    uint64_t size = memory_region_size(mr);
> +    uint64_t addr;
> +
> +    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, &local_err);
> +    if (local_err) {
> +        goto out;
> +    }
> +
> +    spapr_del_lmbs(dev, addr, size, &local_err);
> +out:
> +    error_propagate(errp, local_err);
> +}
> +
>  static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
>                                        DeviceState *dev, Error **errp)
>  {
> @@ -2221,7 +2300,15 @@ static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
>                                        DeviceState *dev, Error **errp)
>  {
>      if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
> -        error_setg(errp, "Memory hot unplug not supported by sPAPR");
> +        spapr_memory_unplug(hotplug_dev, dev, errp);
> +    }
> +}
> +
> +static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
> +                                                DeviceState *dev, Error **errp)
> +{
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
> +        spapr_memory_unplug_request(hotplug_dev, dev, errp);
>      }
>  }
>  
> @@ -2263,6 +2350,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>      hc->plug = spapr_machine_device_plug;
>      hc->unplug = spapr_machine_device_unplug;
>      mc->cpu_index_to_socket_id = spapr_cpu_index_to_socket_id;
> +    hc->unplug_request = spapr_machine_device_unplug_request;
>  
>      smc->dr_lmb_enabled = false;
>      fwc->get_dev_path = spapr_get_fw_dev_path;
> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> index 5d6ea7c..59b6ea9 100644
> --- a/hw/ppc/spapr_drc.c
> +++ b/hw/ppc/spapr_drc.c
> @@ -11,6 +11,7 @@
>   */
>  
>  #include "hw/ppc/spapr_drc.h"
> +#include "hw/ppc/spapr.h"
>  #include "qom/object.h"
>  #include "hw/qdev.h"
>  #include "qapi/visitor.h"
> @@ -77,6 +78,23 @@ static uint32_t set_isolation_state(sPAPRDRConnector *drc,
>          }
>      }
>  
> +    /*
> +     * Fail any request to ISOLATE the LMB DRC if this LMB doesn't
> +     * belong to a DIMM device that is marked for removal.
> +     *
> +     * Currently the guest userspace tool drmgr that drives the memory
> +     * hotplug/unplug will just try to remove a set of 'removable' LMBs
> +     * in response to a hot unplug request that is based on drc-count.
> +     * If the LMB being removed doesn't belong to a DIMM device that is
> +     * actually being unplugged, fail the isolation request here.
> +     */
> +    if (drc->type == SPAPR_DR_CONNECTOR_TYPE_LMB) {
> +        if ((state == SPAPR_DR_ISOLATION_STATE_ISOLATED) &&
> +             !drc->awaiting_release) {
> +            return RTAS_OUT_HW_ERROR;
> +        }
> +    }
> +
>      drc->isolation_state = state;
>  
>      if (drc->isolation_state == SPAPR_DR_ISOLATION_STATE_ISOLATED) {
Bharata B Rao Nov. 11, 2015, 7:20 a.m. UTC | #4
On Wed, Nov 11, 2015 at 12:36:30PM +1100, David Gibson wrote:
> On Mon, Oct 26, 2015 at 03:23:05PM +0530, Bharata B Rao wrote:
> > Add support to hot remove pc-dimm memory devices.
> 
> Sorry it's taken me so long to look at this.
> 
> > TODO: In response to memory hot removal operation on a DIMM device,
> > guest kernel might refuse to offline a few LMBs that are part of that device.
> > In such cases, we will have a DIMM device that has some LMBs online and some
> > LMBs offline. To avoid this situation, drmgr could be enhanced to support
> > a command line option that results in removal of all the LMBs or none.

I am realizing that it might not be possible to support the above notion of
zero or full removal of DIMM unless we have an association of LMBs
to DIMMs. drmgr could be handling unplug requests of multiple DIMMs
at a time and it is really not possible to determine how many LMBs of
which DIMM got off-lined successfully.

With this memory unplug patch that I have posted here and with the existing
drmgr, this is the behaviour:

- If guest kernel refuses to offline some LMBs of the DIMM for which
  hot unplug request is done, such LMBs will remain online while the
  rest of the LMBs from the DIMM will be offline. The DIMM device will
  continue to exist from QEMU point of view and will be removed only when
  all the LMBs belonging to it are removed.

- Since we implement LMB addition/removal by count, during an unplug request,
  drmgr will walk through all the removable/hotplugged DIMMs and will attempt
  to unplug the specified number of LMBs. In addition to other things, this
  involves marking the LMB offline first (via
  /sys/devices/system/memory/memoryXX/online) and then releasing the DRC
  which involves isolating the DRC.

  QEMU will fail isolation request for any LMB DRC that has not been marked
  for removal via prior DIMM device_del operation. In response to this
  failure, drmgr will bring the LMB back online by writing to
  /sys/devices/system/memory/memoryXX/online.

  Thus some LMBs that are unrelated to the current DIMM removal request
  are unnecessarily made offline only to be brought online immediately.

  There is another situation possible here. Let DIMM1 have 4 LMBs out of
  which 2 LMBs became offline in response to unplug request. Now during
  the removal of DIMM2, there is nothing that prevents those 2 LMBs from
  DIMM1 from going away (if guest allows). So we have a situation where
  some LMBs of DIMM1 got offline when running unplug request for DIMM2.
  Though I have not seen this in practice, I believe it is possible.

> 
> Hm.. what would be the end result of such a situation?  We want to
> handle it as gracefully as we can, even if the guest has old tools.
> Is there some way we can detect this failure condition, and re-connect
> the DIMM?
> 
> It does highlight the fact that the PAPR hotplug interface and the
> pc-dimm model don't work together terribly well.  I think we have to

I believe we will have a saner model when PAPR is updated to support
removal of specified number of LMBs starting at a particular DRC index.
With that in place, we could support pc-dimm model removal correctly.

> try to support it for the sake of management layers, but I do wonder
> if we ought to thinkg about an alternative "lmb-pool" backend, where
> the precise location of memory blocks isn't so important.  With some
> thought such a backend might also be useful for paravirt x86.
> 
> Which also makes me think, I wonder if it would be possible to wire up
> a PAPR compatible interface to qemu's balloon backend, since in some
> ways the PAPR memory hotplug model acts more like a balloon (in that
> the guest physical address of removed LMBs isn't usually important to
> the host).
> 
> Still, we need to get the dimm backed model working first, I guess.
> 

So

1. can we live with the current hotunplug behaviour that I described above
  for now or
2. should we put pc-dimm based memory hotunplug on backburner until we get
  PAPR spec changes in place or
3. start exploring lmb-pool model ?

Regards,
Bharata.
David Gibson Nov. 12, 2015, 6:03 a.m. UTC | #5
On Wed, Nov 11, 2015 at 12:50:57PM +0530, Bharata B Rao wrote:
> On Wed, Nov 11, 2015 at 12:36:30PM +1100, David Gibson wrote:
> > On Mon, Oct 26, 2015 at 03:23:05PM +0530, Bharata B Rao wrote:
> > > Add support to hot remove pc-dimm memory devices.
> > 
> > Sorry it's taken me so long to look at this.
> > 
> > > TODO: In response to memory hot removal operation on a DIMM device,
> > > guest kernel might refuse to offline a few LMBs that are part of that device.
> > > In such cases, we will have a DIMM device that has some LMBs online and some
> > > LMBs offline. To avoid this situation, drmgr could be enhanced to support
> > > a command line option that results in removal of all the LMBs or none.
> 
> I am realizing that it might not be possible to support the above notion of
> zero or full removal of DIMM unless we have an association of LMBs
> to DIMMs.

Aren't LMBs implicitly associated to DIMMs by physical address?

> drmgr could be handling unplug requests of multiple DIMMs
> at a time and it is really not possible to determine how many LMBs of
> which DIMM got off-lined successfully.

I think I'm missing something.  So far I can see that that would be a
bit fiddly, but I don't see an inherent problem.

> With this memory unplug patch that I have posted here and with the existing
> drmgr, this is the behaviour:
> 
> - If guest kernel refuses to offline some LMBs of the DIMM for which
>   hot unplug request is done, such LMBs will remain online while the
>   rest of the LMBs from the DIMM will be offline. The DIMM device will
>   continue to exist from QEMU point of view and will be removed only when
>   all the LMBs belonging to it are removed.

Yeah, that's a bit nasty :/

> - Since we implement LMB addition/removal by count, during an unplug request,
>   drmgr will walk through all the removable/hotplugged DIMMs and will attempt
>   to unplug the specified number of LMBs. In addition to other things, this
>   involves marking the LMB offline first (via
>   /sys/devices/system/memory/memoryXX/online) and then releasing the DRC
>   which involves isolating the DRC.
> 
>   QEMU will fail isolation request for any LMB DRC that has not been marked
>   for removal via prior DIMM device_del operation. In response to this
>   failure, drmgr will bring the LMB back online by writing to
>   /sys/devices/system/memory/memoryXX/online.
> 
>   Thus some LMBs that are unrelated to the current DIMM removal request
>   are unnecessarily made offline only to be brought online immediately.

"Some".  As far as I can tell, with no hint address, you'd expect to
go through on average half of the LMBs other than the ones in the
right DIMM before you hit the right DIMM.  Which means with a
reasonably large number of DIMMs you'll churn through nearly half your
memory (on average) on every attempted DIMM removal.

I suspect "awful" would rather understate the performance impact of
this on a well-utilized system.

>   There is another situation possible here. Let DIMM1 have 4 LMBs out of
>   which 2 LMBs became offline in response to unplug request. Now during
>   the removal of DIMM2, there is nothing that prevents those 2 LMBs from
>   DIMM1 from going away (if guest allows). So we have a situation where
>   some LMBs of DIMM1 got offline when running unplug request for DIMM2.
>   Though I have not seen this in practice, I believe it is possible.

Ick.  Yeah, it seems like we could only really keep this sane with
both the pc-dimm and PAPR interfaces if we forced serialization of
each DIMM unplug, somehow detected failure of one and rolled it back
before attempting another one.

> > Hm.. what would be the end result of such a situation?  We want to
> > handle it as gracefully as we can, even if the guest has old tools.
> > Is there some way we can detect this failure condition, and re-connect
> > the DIMM?
> > 
> > It does highlight the fact that the PAPR hotplug interface and the
> > pc-dimm model don't work together terribly well.  I think we have to
> 
> I believe we will have a saner model when PAPR is updated to support
> removal of specified number of LMBs starting at a particular DRC index.
> With that in place, we could support pc-dimm model removal correctly.

Right.  Do we have any guess at a timeline for this?

> > try to support it for the sake of management layers, but I do wonder
> > if we ought to thinkg about an alternative "lmb-pool" backend, where
> > the precise location of memory blocks isn't so important.  With some
> > thought such a backend might also be useful for paravirt x86.
> > 
> > Which also makes me think, I wonder if it would be possible to wire up
> > a PAPR compatible interface to qemu's balloon backend, since in some
> > ways the PAPR memory hotplug model acts more like a balloon (in that
> > the guest physical address of removed LMBs isn't usually important to
> > the host).
> > 
> > Still, we need to get the dimm backed model working first, I guess.
> > 
> 
> So
> 
> 1. can we live with the current hotunplug behaviour that I described above
>   for now or

Unless I've missed something to mitigate the memmove()-half-of-RAM
problem, then no, I don't think we can.

> 2. should we put pc-dimm based memory hotunplug on backburner until we get
>   PAPR spec changes in place or
> 3. start exploring lmb-pool model ?

I'm not sure which of these will work best.  From the qemu and kernel
perspective alone, I think (3) will give sane results faster, but
updating libvirt and the rest of the management stack to handle this
different way of doing things might be an awful lot of extra work and
time.
diff mbox

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index e1202ce..f5b1ac2 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2174,6 +2174,85 @@  out:
     error_propagate(errp, local_err);
 }
 
+typedef struct sPAPRDIMMState {
+    uint32_t nr_lmbs;
+} sPAPRDIMMState;
+
+static void spapr_lmb_release(DeviceState *dev, void *opaque)
+{
+    sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
+    HotplugHandler *hotplug_ctrl = NULL;
+    Error *local_err = NULL;
+
+    if (--ds->nr_lmbs) {
+        return;
+    }
+
+    g_free(ds);
+
+    /*
+     * Now that all the LMBs have been removed by the guest, call the
+     * pc-dimm unplug handler to cleanup up the pc-dimm device.
+     */
+    hotplug_ctrl = qdev_get_hotplug_handler(dev);
+    hotplug_handler_unplug(hotplug_ctrl, dev, &local_err);
+}
+
+static void spapr_del_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
+                           Error **errp)
+{
+    sPAPRDRConnector *drc;
+    sPAPRDRConnectorClass *drck;
+    uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
+    Error *local_err = NULL;
+    int i;
+    sPAPRDIMMState *ds = g_malloc0(sizeof(sPAPRDIMMState));
+
+    ds->nr_lmbs = nr_lmbs;
+    for (i = 0; i < nr_lmbs; i++) {
+        drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
+                addr/SPAPR_MEMORY_BLOCK_SIZE);
+        g_assert(drc);
+
+        drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+        drck->detach(drc, dev, spapr_lmb_release, ds, &local_err);
+        addr += SPAPR_MEMORY_BLOCK_SIZE;
+    }
+    spapr_hotplug_req_remove_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
+}
+
+static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
+                                Error **errp)
+{
+    sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
+    PCDIMMDevice *dimm = PC_DIMM(dev);
+    PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+    MemoryRegion *mr = ddc->get_memory_region(dimm);
+
+    pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
+    object_unparent(OBJECT(dev));
+}
+
+static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
+                                        DeviceState *dev, Error **errp)
+{
+    Error *local_err = NULL;
+    PCDIMMDevice *dimm = PC_DIMM(dev);
+    PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+    MemoryRegion *mr = ddc->get_memory_region(dimm);
+    uint64_t size = memory_region_size(mr);
+    uint64_t addr;
+
+    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    spapr_del_lmbs(dev, addr, size, &local_err);
+out:
+    error_propagate(errp, local_err);
+}
+
 static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
                                       DeviceState *dev, Error **errp)
 {
@@ -2221,7 +2300,15 @@  static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
                                       DeviceState *dev, Error **errp)
 {
     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-        error_setg(errp, "Memory hot unplug not supported by sPAPR");
+        spapr_memory_unplug(hotplug_dev, dev, errp);
+    }
+}
+
+static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
+                                                DeviceState *dev, Error **errp)
+{
+    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+        spapr_memory_unplug_request(hotplug_dev, dev, errp);
     }
 }
 
@@ -2263,6 +2350,7 @@  static void spapr_machine_class_init(ObjectClass *oc, void *data)
     hc->plug = spapr_machine_device_plug;
     hc->unplug = spapr_machine_device_unplug;
     mc->cpu_index_to_socket_id = spapr_cpu_index_to_socket_id;
+    hc->unplug_request = spapr_machine_device_unplug_request;
 
     smc->dr_lmb_enabled = false;
     fwc->get_dev_path = spapr_get_fw_dev_path;
diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index 5d6ea7c..59b6ea9 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -11,6 +11,7 @@ 
  */
 
 #include "hw/ppc/spapr_drc.h"
+#include "hw/ppc/spapr.h"
 #include "qom/object.h"
 #include "hw/qdev.h"
 #include "qapi/visitor.h"
@@ -77,6 +78,23 @@  static uint32_t set_isolation_state(sPAPRDRConnector *drc,
         }
     }
 
+    /*
+     * Fail any request to ISOLATE the LMB DRC if this LMB doesn't
+     * belong to a DIMM device that is marked for removal.
+     *
+     * Currently the guest userspace tool drmgr that drives the memory
+     * hotplug/unplug will just try to remove a set of 'removable' LMBs
+     * in response to a hot unplug request that is based on drc-count.
+     * If the LMB being removed doesn't belong to a DIMM device that is
+     * actually being unplugged, fail the isolation request here.
+     */
+    if (drc->type == SPAPR_DR_CONNECTOR_TYPE_LMB) {
+        if ((state == SPAPR_DR_ISOLATION_STATE_ISOLATED) &&
+             !drc->awaiting_release) {
+            return RTAS_OUT_HW_ERROR;
+        }
+    }
+
     drc->isolation_state = state;
 
     if (drc->isolation_state == SPAPR_DR_ISOLATION_STATE_ISOLATED) {