diff mbox

[13/27] acpi: memory hotplug ACPI hardware implementation

Message ID 1385001528-12003-14-git-send-email-imammedo@redhat.com
State New
Headers show

Commit Message

Igor Mammedov Nov. 21, 2013, 2:38 a.m. UTC
- implements QEMU hardware part of memory hotplug protocol
  described at "docs/specs/acpi_mem_hotplug.txt"
- handles only memory add notification event for now

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
 docs/specs/acpi_mem_hotplug.txt |   38 ++++++++++
 hw/acpi/core.c                  |  144 +++++++++++++++++++++++++++++++++++++++
 include/hw/acpi/acpi.h          |   24 +++++++
 3 files changed, 206 insertions(+), 0 deletions(-)
 create mode 100644 docs/specs/acpi_mem_hotplug.txt

Comments

Michael S. Tsirkin Nov. 21, 2013, 9:42 a.m. UTC | #1
On Thu, Nov 21, 2013 at 03:38:34AM +0100, Igor Mammedov wrote:
> - implements QEMU hardware part of memory hotplug protocol
>   described at "docs/specs/acpi_mem_hotplug.txt"
> - handles only memory add notification event for now
> 
> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---
>  docs/specs/acpi_mem_hotplug.txt |   38 ++++++++++
>  hw/acpi/core.c                  |  144 +++++++++++++++++++++++++++++++++++++++
>  include/hw/acpi/acpi.h          |   24 +++++++
>  3 files changed, 206 insertions(+), 0 deletions(-)
>  create mode 100644 docs/specs/acpi_mem_hotplug.txt
> 
> diff --git a/docs/specs/acpi_mem_hotplug.txt b/docs/specs/acpi_mem_hotplug.txt
> new file mode 100644
> index 0000000..fc34142
> --- /dev/null
> +++ b/docs/specs/acpi_mem_hotplug.txt
> @@ -0,0 +1,38 @@
> +QEMU<->ACPI BIOS memory hotplug interface
> +--------------------------------------
> +
> +ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add
> +or hot-remove events.
> +
> +Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access):
> +---------------------------------------------------------------
> +0xa00:
> +  read access:
> +      [0x0-0x3] Lo part of memory device phys address
> +      [0x4-0x7] Hi part of memory device phys address
> +      [0x8-0xb] Lo part of memory device size in bytes
> +      [0xc-0xf] Hi part of memory device size in bytes
> +      [0x14] highest memory hot-plug interface version supported by QEMU

So this can make guest fail gracefully but it appears that
detecting guest version would be nicer?
It would let us actually support old guests ...

> +      [0x15] Memory device status fields
> +          bits:
> +              1: device is enabled and may be used by guest
> +              2: device insert event, used by ACPI BIOS to distinguish
> +                 device for which no device check event to OSPM was issued

what does the above mean?
what if device is not present?

> +      [0x16-0x17] reserved
> +
> +  write access:
> +      [0x0-0x3] Memory device slot selector, selects active memory device.
> +                All following accesses to other registers in 0xaf80-0xaf97
> +                region will read/store data from/to selected memory device.
> +      [0x4-0x7] OST event code reported by OSPM
> +      [0x8-0xb] OST status code reported by OSPM
> +      [0x15] Memory device status fields

this is control, not status?

> +          bits:
> +              2: if set to 1 clears device insert event, set by ACPI BIOS
> +                 after it has sent device check event to OSPM for
> +                 seleted memory device

selected?

How about we actually require guest to enable memory?

This way if we hotplug, but old guest does not enable
and puts a PCI device there, it just works.


> +
> +Selecting memory device slot beyond present range has no effect on platform:
> +   - not documented above write accesses to memory hot-plug registers
> +     are ignored;
> +   - not documented above read accesses to memory hot-plug registers return 0xFF

00 would be cleaner (matches not enabled - no event)?

> diff --git a/hw/acpi/core.c b/hw/acpi/core.c
> index 8c0d48c..18e169c 100644
> --- a/hw/acpi/core.c
> +++ b/hw/acpi/core.c
> @@ -680,3 +680,147 @@ void acpi_update_sci(ACPIREGS *regs, qemu_irq irq, uint32_t gpe0_sts_mask)
>                         (regs->pm1.evt.en & ACPI_BITMASK_TIMER_ENABLE) &&
>                         !(pm1a_sts & ACPI_BITMASK_TIMER_STATUS));
>  }
> +
> +static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr,
> +                                         unsigned int size)
> +{
> +    uint32_t val = 0;
> +    MemHotplugState *mem_st = opaque;
> +    MemStatus *mdev;
> +
> +    if (mem_st->selector >= mem_st->dev_count) {
> +        return 0;
> +    }
> +
> +    mdev = &mem_st->devs[mem_st->selector];
> +    switch (addr) {
> +    case 0x0: /* Lo part of phys address where DIMM is mapped */
> +        val = object_property_get_int(OBJECT(mdev->dimm), "start", NULL);
> +        break;
> +    case 0x4: /* Hi part of phys address where DIMM is mapped */
> +        val = object_property_get_int(OBJECT(mdev->dimm), "start", NULL) >> 32;
> +        break;
> +    case 0x8: /* Lo part of DIMM size */
> +        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL);
> +        break;
> +    case 0xc: /* Hi part of DIMM size */
> +        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL) >> 32;
> +        break;
> +    case 0x10: /* node proximity for _PXM method */
> +        val = object_property_get_int(OBJECT(mdev->dimm), "node", NULL);
> +        break;
> +    case 0x14: /* intf version */
> +        val = 1;
> +        break;
> +    case 0x15: /* pack and return is_* fields */
> +        val |= mdev->is_enabled   ? 1 : 0;
> +        val |= mdev->is_inserting ? 2 : 0;
> +        break;
> +    }
> +    return val;
> +}
> +
> +static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data,
> +                                      unsigned int size)
> +{
> +    MemHotplugState *mem_st = opaque;
> +    MemStatus *mdev;
> +
> +    if (!mem_st->dev_count) {
> +        return;
> +    }
> +
> +    if (addr) {
> +        if (mem_st->selector >= mem_st->dev_count) {
> +            return;
> +        }
> +    }
> +
> +    switch (addr) {
> +    case 0x0: /* DIMM slot selector */
> +        mem_st->selector = data;
> +        break;
> +    case 0x4: /* _OST event  */
> +        mdev = &mem_st->devs[mem_st->selector];
> +        if (data == 1) {
> +            /* TODO: handle device insert OST event */
> +        } else if (data == 3) {
> +            /* TODO: handle device remove OST event */
> +        }
> +        mdev->ost_event = data;
> +        break;
> +    case 0x8: /* _OST status */
> +        mdev = &mem_st->devs[mem_st->selector];
> +        mdev->ost_status = data;
> +        /* TODO: report async error */
> +        /* TODO: implement memory removal on guest signal */
> +        break;
> +    case 0x15:
> +        mdev = &mem_st->devs[mem_st->selector];
> +        if (data & 2) { /* clear insert event */
> +            mdev->is_inserting  = false;
> +        }
> +        break;
> +    }
> +
> +}
> +static const MemoryRegionOps acpi_memory_hotplug_ops = {
> +    .read = acpi_memory_hotplug_read,
> +    .write = acpi_memory_hotplug_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 4,
> +    },
> +};
> +
> +void acpi_memory_hotplug_init(Object *owner, MemoryRegion *io,
> +                              MemHotplugState *state)
> +{
> +    QemuOpts *opts = qemu_opts_find(qemu_find_opts("memory-opts"), NULL);
> +    g_assert(opts);
> +
> +    state->dev_count = qemu_opt_get_number(opts, "slots", 0);
> +
> +    if (!state->dev_count) {
> +        return;
> +    }
> +
> +    state->devs = g_malloc0(sizeof(*state->devs) * state->dev_count);
> +    memory_region_init_io(io, owner, &acpi_memory_hotplug_ops, state,
> +                          "apci-mem-hotplug", ACPI_MEMORY_HOTPLUG_IO_LEN);
> +}
> +
> +int acpi_memory_hotplug_cb(ACPIREGS *regs, MemHotplugState *mem_st,
> +                           DeviceState *dev, HotplugState state)
> +{
> +    MemStatus *mdev;
> +    Error *local_err = NULL;
> +    int slot = object_property_get_int(OBJECT(dev), "slot", &local_err);
> +
> +    if (error_is_set(&local_err)) {
> +        qerror_report_err(local_err);
> +        error_free(local_err);
> +        return -1;
> +    }
> +
> +    if (slot >= mem_st->dev_count) {
> +        char *dev_path = object_get_canonical_path(OBJECT(dev));
> +        qerror_report(ERROR_CLASS_GENERIC_ERROR, "acpi_memory_hotplug_cb: "
> +                      "device [%s] returned invalid memory slot[%d]",
> +                      dev_path, slot);
> +        g_free(dev_path);
> +        return -1;
> +    }
> +
> +    mdev = &mem_st->devs[slot];
> +    if (state == HOTPLUG_ENABLED) {
> +        mdev->dimm = dev;
> +        mdev->is_enabled = true;
> +        mdev->is_inserting = true;
> +    }
> +
> +    /* do ACPI magic */
> +    regs->gpe.sts[0] |= ACPI_MEMORY_HOTPLUG_STATUS;
> +    return 0;
> +}
> diff --git a/include/hw/acpi/acpi.h b/include/hw/acpi/acpi.h
> index c4ae7d7..e5717df 100644
> --- a/include/hw/acpi/acpi.h
> +++ b/include/hw/acpi/acpi.h
> @@ -168,6 +168,30 @@ uint32_t acpi_gpe_ioport_readb(ACPIREGS *ar, uint32_t addr);
>  
>  void acpi_update_sci(ACPIREGS *acpi_regs, qemu_irq irq, uint32_t gpe0_sts_mask);
>  
> +#define ACPI_MEMORY_HOTPLUG_IO_LEN 24
> +#define ACPI_MEMORY_HOTPLUG_BASE 0x0a00
> +
> +#define ACPI_MEMORY_HOTPLUG_STATUS 8
> +
> +typedef struct MemStatus {
> +    DeviceState *dimm;
> +    bool is_enabled;
> +    bool is_inserting;
> +    uint32_t ost_event;
> +    uint32_t ost_status;
> +} MemStatus;
> +
> +typedef struct MemHotplugState {
> +    uint32_t selector;
> +    uint32_t dev_count;
> +    MemStatus *devs;
> +} MemHotplugState;
> +
> +void acpi_memory_hotplug_init(Object *owner, MemoryRegion *io,
> +                              MemHotplugState *state);
> +
> +int acpi_memory_hotplug_cb(ACPIREGS *regs, MemHotplugState *mem_st,
> +                           DeviceState *dev, HotplugState state);
>  /* acpi.c */
>  extern int acpi_enabled;
>  extern char unsigned *acpi_tables;
> -- 
> 1.7.1
Igor Mammedov Nov. 21, 2013, 2:21 p.m. UTC | #2
On Thu, 21 Nov 2013 11:42:02 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Nov 21, 2013 at 03:38:34AM +0100, Igor Mammedov wrote:
> > - implements QEMU hardware part of memory hotplug protocol
> >   described at "docs/specs/acpi_mem_hotplug.txt"
> > - handles only memory add notification event for now
> > 
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > ---
> >  docs/specs/acpi_mem_hotplug.txt |   38 ++++++++++
> >  hw/acpi/core.c                  |  144 +++++++++++++++++++++++++++++++++++++++
> >  include/hw/acpi/acpi.h          |   24 +++++++
> >  3 files changed, 206 insertions(+), 0 deletions(-)
> >  create mode 100644 docs/specs/acpi_mem_hotplug.txt
> > 
> > diff --git a/docs/specs/acpi_mem_hotplug.txt b/docs/specs/acpi_mem_hotplug.txt
> > new file mode 100644
> > index 0000000..fc34142
> > --- /dev/null
> > +++ b/docs/specs/acpi_mem_hotplug.txt
> > @@ -0,0 +1,38 @@
> > +QEMU<->ACPI BIOS memory hotplug interface
> > +--------------------------------------
> > +
> > +ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add
> > +or hot-remove events.
> > +
> > +Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access):
> > +---------------------------------------------------------------
> > +0xa00:
> > +  read access:
> > +      [0x0-0x3] Lo part of memory device phys address
> > +      [0x4-0x7] Hi part of memory device phys address
> > +      [0x8-0xb] Lo part of memory device size in bytes
> > +      [0xc-0xf] Hi part of memory device size in bytes
> > +      [0x14] highest memory hot-plug interface version supported by QEMU
> 
> So this can make guest fail gracefully but it appears that
> detecting guest version would be nicer?
> It would let us actually support old guests ...

my idea of how to it was,
guest writes its version into [0x14] register and reads QEMU version
from it back, if they do not match then than BIOS ignores GPE.3 event
effectively disabling hotplug on guest side.
I haven't thought about supporting multiple implementations in QEMU though.
Do we really want it?

> 
> > +      [0x15] Memory device status fields
> > +          bits:
> > +              1: device is enabled and may be used by guest
> > +              2: device insert event, used by ACPI BIOS to distinguish
> > +                 device for which no device check event to OSPM was issued
> 
> what does the above mean?
After OSPM issued device check on selected device it clears this bit to mark event
as handled.
It allows to avoid keeping this state in ASL (as it's done for CPU hotplug, see CPON)

> what if device is not present?
ASL will issue device check and clear bit, it might be a bug since _STA would report
not present but no eject event was issued.

Papering over it ASL could check present bit first and issue device check only if
it's present.

> 
> > +      [0x16-0x17] reserved
> > +
> > +  write access:
> > +      [0x0-0x3] Memory device slot selector, selects active memory device.
> > +                All following accesses to other registers in 0xaf80-0xaf97
> > +                region will read/store data from/to selected memory device.
> > +      [0x4-0x7] OST event code reported by OSPM
> > +      [0x8-0xb] OST status code reported by OSPM
> > +      [0x15] Memory device status fields
> 
> this is control, not status?
Thanks, I'll fix it.

> 
> > +          bits:
> > +              2: if set to 1 clears device insert event, set by ACPI BIOS
> > +                 after it has sent device check event to OSPM for
> > +                 seleted memory device
> 
> selected?
see "write access: [0x0-0x3]"

> 
> How about we actually require guest to enable memory?
> 
> This way if we hotplug, but old guest does not enable
> and puts a PCI device there, it just works.
I've lost you here, could you elaborate pls?

> 
> > +
> > +Selecting memory device slot beyond present range has no effect on platform:
> > +   - not documented above write accesses to memory hot-plug registers
> > +     are ignored;
> > +   - not documented above read accesses to memory hot-plug registers return 0xFF
> 
> 00 would be cleaner (matches not enabled - no event)?
I'm following pattern, where reading from not present IO port returns 0xFF on hardware.
Fact that ASL reads 0xFF could be used as not supported indication.

> 
> > diff --git a/hw/acpi/core.c b/hw/acpi/core.c
> > index 8c0d48c..18e169c 100644
> > --- a/hw/acpi/core.c
> > +++ b/hw/acpi/core.c
> > @@ -680,3 +680,147 @@ void acpi_update_sci(ACPIREGS *regs, qemu_irq irq, uint32_t gpe0_sts_mask)
> >                         (regs->pm1.evt.en & ACPI_BITMASK_TIMER_ENABLE) &&
> >                         !(pm1a_sts & ACPI_BITMASK_TIMER_STATUS));
> >  }
> > +
> > +static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr,
> > +                                         unsigned int size)
> > +{
> > +    uint32_t val = 0;
> > +    MemHotplugState *mem_st = opaque;
> > +    MemStatus *mdev;
> > +
> > +    if (mem_st->selector >= mem_st->dev_count) {
> > +        return 0;
> > +    }
> > +
> > +    mdev = &mem_st->devs[mem_st->selector];
> > +    switch (addr) {
> > +    case 0x0: /* Lo part of phys address where DIMM is mapped */
> > +        val = object_property_get_int(OBJECT(mdev->dimm), "start", NULL);
> > +        break;
> > +    case 0x4: /* Hi part of phys address where DIMM is mapped */
> > +        val = object_property_get_int(OBJECT(mdev->dimm), "start", NULL) >> 32;
> > +        break;
> > +    case 0x8: /* Lo part of DIMM size */
> > +        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL);
> > +        break;
> > +    case 0xc: /* Hi part of DIMM size */
> > +        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL) >> 32;
> > +        break;
> > +    case 0x10: /* node proximity for _PXM method */
> > +        val = object_property_get_int(OBJECT(mdev->dimm), "node", NULL);
> > +        break;
> > +    case 0x14: /* intf version */
> > +        val = 1;
> > +        break;
> > +    case 0x15: /* pack and return is_* fields */
> > +        val |= mdev->is_enabled   ? 1 : 0;
> > +        val |= mdev->is_inserting ? 2 : 0;
> > +        break;
> > +    }
> > +    return val;
> > +}
> > +
> > +static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data,
> > +                                      unsigned int size)
> > +{
> > +    MemHotplugState *mem_st = opaque;
> > +    MemStatus *mdev;
> > +
> > +    if (!mem_st->dev_count) {
> > +        return;
> > +    }
> > +
> > +    if (addr) {
> > +        if (mem_st->selector >= mem_st->dev_count) {
> > +            return;
> > +        }
> > +    }
> > +
> > +    switch (addr) {
> > +    case 0x0: /* DIMM slot selector */
> > +        mem_st->selector = data;
> > +        break;
> > +    case 0x4: /* _OST event  */
> > +        mdev = &mem_st->devs[mem_st->selector];
> > +        if (data == 1) {
> > +            /* TODO: handle device insert OST event */
> > +        } else if (data == 3) {
> > +            /* TODO: handle device remove OST event */
> > +        }
> > +        mdev->ost_event = data;
> > +        break;
> > +    case 0x8: /* _OST status */
> > +        mdev = &mem_st->devs[mem_st->selector];
> > +        mdev->ost_status = data;
> > +        /* TODO: report async error */
> > +        /* TODO: implement memory removal on guest signal */
> > +        break;
> > +    case 0x15:
> > +        mdev = &mem_st->devs[mem_st->selector];
> > +        if (data & 2) { /* clear insert event */
> > +            mdev->is_inserting  = false;
> > +        }
> > +        break;
> > +    }
> > +
> > +}
> > +static const MemoryRegionOps acpi_memory_hotplug_ops = {
> > +    .read = acpi_memory_hotplug_read,
> > +    .write = acpi_memory_hotplug_write,
> > +    .endianness = DEVICE_LITTLE_ENDIAN,
> > +    .valid = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 4,
> > +    },
> > +};
> > +
> > +void acpi_memory_hotplug_init(Object *owner, MemoryRegion *io,
> > +                              MemHotplugState *state)
> > +{
> > +    QemuOpts *opts = qemu_opts_find(qemu_find_opts("memory-opts"), NULL);
> > +    g_assert(opts);
> > +
> > +    state->dev_count = qemu_opt_get_number(opts, "slots", 0);
> > +
> > +    if (!state->dev_count) {
> > +        return;
> > +    }
> > +
> > +    state->devs = g_malloc0(sizeof(*state->devs) * state->dev_count);
> > +    memory_region_init_io(io, owner, &acpi_memory_hotplug_ops, state,
> > +                          "apci-mem-hotplug", ACPI_MEMORY_HOTPLUG_IO_LEN);
> > +}
> > +
> > +int acpi_memory_hotplug_cb(ACPIREGS *regs, MemHotplugState *mem_st,
> > +                           DeviceState *dev, HotplugState state)
> > +{
> > +    MemStatus *mdev;
> > +    Error *local_err = NULL;
> > +    int slot = object_property_get_int(OBJECT(dev), "slot", &local_err);
> > +
> > +    if (error_is_set(&local_err)) {
> > +        qerror_report_err(local_err);
> > +        error_free(local_err);
> > +        return -1;
> > +    }
> > +
> > +    if (slot >= mem_st->dev_count) {
> > +        char *dev_path = object_get_canonical_path(OBJECT(dev));
> > +        qerror_report(ERROR_CLASS_GENERIC_ERROR, "acpi_memory_hotplug_cb: "
> > +                      "device [%s] returned invalid memory slot[%d]",
> > +                      dev_path, slot);
> > +        g_free(dev_path);
> > +        return -1;
> > +    }
> > +
> > +    mdev = &mem_st->devs[slot];
> > +    if (state == HOTPLUG_ENABLED) {
> > +        mdev->dimm = dev;
> > +        mdev->is_enabled = true;
> > +        mdev->is_inserting = true;
> > +    }
> > +
> > +    /* do ACPI magic */
> > +    regs->gpe.sts[0] |= ACPI_MEMORY_HOTPLUG_STATUS;
> > +    return 0;
> > +}
> > diff --git a/include/hw/acpi/acpi.h b/include/hw/acpi/acpi.h
> > index c4ae7d7..e5717df 100644
> > --- a/include/hw/acpi/acpi.h
> > +++ b/include/hw/acpi/acpi.h
> > @@ -168,6 +168,30 @@ uint32_t acpi_gpe_ioport_readb(ACPIREGS *ar, uint32_t addr);
> >  
> >  void acpi_update_sci(ACPIREGS *acpi_regs, qemu_irq irq, uint32_t gpe0_sts_mask);
> >  
> > +#define ACPI_MEMORY_HOTPLUG_IO_LEN 24
> > +#define ACPI_MEMORY_HOTPLUG_BASE 0x0a00
> > +
> > +#define ACPI_MEMORY_HOTPLUG_STATUS 8
> > +
> > +typedef struct MemStatus {
> > +    DeviceState *dimm;
> > +    bool is_enabled;
> > +    bool is_inserting;
> > +    uint32_t ost_event;
> > +    uint32_t ost_status;
> > +} MemStatus;
> > +
> > +typedef struct MemHotplugState {
> > +    uint32_t selector;
> > +    uint32_t dev_count;
> > +    MemStatus *devs;
> > +} MemHotplugState;
> > +
> > +void acpi_memory_hotplug_init(Object *owner, MemoryRegion *io,
> > +                              MemHotplugState *state);
> > +
> > +int acpi_memory_hotplug_cb(ACPIREGS *regs, MemHotplugState *mem_st,
> > +                           DeviceState *dev, HotplugState state);
> >  /* acpi.c */
> >  extern int acpi_enabled;
> >  extern char unsigned *acpi_tables;
> > -- 
> > 1.7.1
Michael S. Tsirkin Nov. 21, 2013, 2:38 p.m. UTC | #3
On Thu, Nov 21, 2013 at 03:21:37PM +0100, Igor Mammedov wrote:
> On Thu, 21 Nov 2013 11:42:02 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Thu, Nov 21, 2013 at 03:38:34AM +0100, Igor Mammedov wrote:
> > > - implements QEMU hardware part of memory hotplug protocol
> > >   described at "docs/specs/acpi_mem_hotplug.txt"
> > > - handles only memory add notification event for now
> > > 
> > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > ---
> > >  docs/specs/acpi_mem_hotplug.txt |   38 ++++++++++
> > >  hw/acpi/core.c                  |  144 +++++++++++++++++++++++++++++++++++++++
> > >  include/hw/acpi/acpi.h          |   24 +++++++
> > >  3 files changed, 206 insertions(+), 0 deletions(-)
> > >  create mode 100644 docs/specs/acpi_mem_hotplug.txt
> > > 
> > > diff --git a/docs/specs/acpi_mem_hotplug.txt b/docs/specs/acpi_mem_hotplug.txt
> > > new file mode 100644
> > > index 0000000..fc34142
> > > --- /dev/null
> > > +++ b/docs/specs/acpi_mem_hotplug.txt
> > > @@ -0,0 +1,38 @@
> > > +QEMU<->ACPI BIOS memory hotplug interface
> > > +--------------------------------------
> > > +
> > > +ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add
> > > +or hot-remove events.
> > > +
> > > +Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access):
> > > +---------------------------------------------------------------
> > > +0xa00:
> > > +  read access:
> > > +      [0x0-0x3] Lo part of memory device phys address
> > > +      [0x4-0x7] Hi part of memory device phys address
> > > +      [0x8-0xb] Lo part of memory device size in bytes
> > > +      [0xc-0xf] Hi part of memory device size in bytes
> > > +      [0x14] highest memory hot-plug interface version supported by QEMU
> > 
> > So this can make guest fail gracefully but it appears that
> > detecting guest version would be nicer?
> > It would let us actually support old guests ...
> 
> my idea of how to it was,
> guest writes its version into [0x14] register and reads QEMU version
> from it back, if they do not match then than BIOS ignores GPE.3 event
> effectively disabling hotplug on guest side.
> I haven't thought about supporting multiple implementations in QEMU though.
> Do we really want it?

I'm talking about old bios which does not read acpi from qemu.
We want it to work even if it can't see hotplugged memory.

> > 
> > > +      [0x15] Memory device status fields
> > > +          bits:
> > > +              1: device is enabled and may be used by guest
> > > +              2: device insert event, used by ACPI BIOS to distinguish
> > > +                 device for which no device check event to OSPM was issued
> > 
> > what does the above mean?
> After OSPM issued device check on selected device it clears this bit to mark event
> as handled.
> It allows to avoid keeping this state in ASL (as it's done for CPU hotplug, see CPON)

That's fine.

> > what if device is not present?
> ASL will issue device check and clear bit, it might be a bug since _STA would report
> not present but no eject event was issued.
> 
> Papering over it ASL could check present bit first and issue device check only if
> it's present.

Is this a problem? If yes - that will still be racy won't it?


Also, should guest reset eject memory that we requested unplug for?

> > 
> > > +      [0x16-0x17] reserved
> > > +
> > > +  write access:
> > > +      [0x0-0x3] Memory device slot selector, selects active memory device.
> > > +                All following accesses to other registers in 0xaf80-0xaf97
> > > +                region will read/store data from/to selected memory device.
> > > +      [0x4-0x7] OST event code reported by OSPM
> > > +      [0x8-0xb] OST status code reported by OSPM
> > > +      [0x15] Memory device status fields
> > 
> > this is control, not status?
> Thanks, I'll fix it.
> 
> > 
> > > +          bits:
> > > +              2: if set to 1 clears device insert event, set by ACPI BIOS
> > > +                 after it has sent device check event to OSPM for
> > > +                 seleted memory device
> > 
> > selected?
> see "write access: [0x0-0x3]"

yes but you have a typo above

> > 
> > How about we actually require guest to enable memory?
> > 
> > This way if we hotplug, but old guest does not enable
> > and puts a PCI device there, it just works.
> I've lost you here, could you elaborate pls?


Assume qemu adds memory by hotplug.
Is it immediately enabled?
I suggest it's not enabled, and only enable after ACPI
enables it (or after reboot?)

> > 
> > > +
> > > +Selecting memory device slot beyond present range has no effect on platform:
> > > +   - not documented above write accesses to memory hot-plug registers
> > > +     are ignored;
> > > +   - not documented above read accesses to memory hot-plug registers return 0xFF
> > 
> > 00 would be cleaner (matches not enabled - no event)?
> I'm following pattern, where reading from not present IO port returns 0xFF on hardware.
> Fact that ASL reads 0xFF could be used as not supported indication.

But isn't this a valid pattern when all bits are set?

> > 
> > > diff --git a/hw/acpi/core.c b/hw/acpi/core.c
> > > index 8c0d48c..18e169c 100644
> > > --- a/hw/acpi/core.c
> > > +++ b/hw/acpi/core.c
> > > @@ -680,3 +680,147 @@ void acpi_update_sci(ACPIREGS *regs, qemu_irq irq, uint32_t gpe0_sts_mask)
> > >                         (regs->pm1.evt.en & ACPI_BITMASK_TIMER_ENABLE) &&
> > >                         !(pm1a_sts & ACPI_BITMASK_TIMER_STATUS));
> > >  }
> > > +
> > > +static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr,
> > > +                                         unsigned int size)
> > > +{
> > > +    uint32_t val = 0;
> > > +    MemHotplugState *mem_st = opaque;
> > > +    MemStatus *mdev;
> > > +
> > > +    if (mem_st->selector >= mem_st->dev_count) {
> > > +        return 0;
> > > +    }
> > > +
> > > +    mdev = &mem_st->devs[mem_st->selector];
> > > +    switch (addr) {
> > > +    case 0x0: /* Lo part of phys address where DIMM is mapped */
> > > +        val = object_property_get_int(OBJECT(mdev->dimm), "start", NULL);
> > > +        break;
> > > +    case 0x4: /* Hi part of phys address where DIMM is mapped */
> > > +        val = object_property_get_int(OBJECT(mdev->dimm), "start", NULL) >> 32;
> > > +        break;
> > > +    case 0x8: /* Lo part of DIMM size */
> > > +        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL);
> > > +        break;
> > > +    case 0xc: /* Hi part of DIMM size */
> > > +        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL) >> 32;
> > > +        break;
> > > +    case 0x10: /* node proximity for _PXM method */
> > > +        val = object_property_get_int(OBJECT(mdev->dimm), "node", NULL);
> > > +        break;
> > > +    case 0x14: /* intf version */
> > > +        val = 1;
> > > +        break;
> > > +    case 0x15: /* pack and return is_* fields */
> > > +        val |= mdev->is_enabled   ? 1 : 0;
> > > +        val |= mdev->is_inserting ? 2 : 0;
> > > +        break;
> > > +    }
> > > +    return val;
> > > +}
> > > +
> > > +static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data,
> > > +                                      unsigned int size)
> > > +{
> > > +    MemHotplugState *mem_st = opaque;
> > > +    MemStatus *mdev;
> > > +
> > > +    if (!mem_st->dev_count) {
> > > +        return;
> > > +    }
> > > +
> > > +    if (addr) {
> > > +        if (mem_st->selector >= mem_st->dev_count) {
> > > +            return;
> > > +        }
> > > +    }
> > > +
> > > +    switch (addr) {
> > > +    case 0x0: /* DIMM slot selector */
> > > +        mem_st->selector = data;
> > > +        break;
> > > +    case 0x4: /* _OST event  */
> > > +        mdev = &mem_st->devs[mem_st->selector];
> > > +        if (data == 1) {
> > > +            /* TODO: handle device insert OST event */
> > > +        } else if (data == 3) {
> > > +            /* TODO: handle device remove OST event */
> > > +        }
> > > +        mdev->ost_event = data;
> > > +        break;
> > > +    case 0x8: /* _OST status */
> > > +        mdev = &mem_st->devs[mem_st->selector];
> > > +        mdev->ost_status = data;
> > > +        /* TODO: report async error */
> > > +        /* TODO: implement memory removal on guest signal */
> > > +        break;
> > > +    case 0x15:
> > > +        mdev = &mem_st->devs[mem_st->selector];
> > > +        if (data & 2) { /* clear insert event */
> > > +            mdev->is_inserting  = false;
> > > +        }
> > > +        break;
> > > +    }
> > > +
> > > +}
> > > +static const MemoryRegionOps acpi_memory_hotplug_ops = {
> > > +    .read = acpi_memory_hotplug_read,
> > > +    .write = acpi_memory_hotplug_write,
> > > +    .endianness = DEVICE_LITTLE_ENDIAN,
> > > +    .valid = {
> > > +        .min_access_size = 1,
> > > +        .max_access_size = 4,
> > > +    },
> > > +};
> > > +
> > > +void acpi_memory_hotplug_init(Object *owner, MemoryRegion *io,
> > > +                              MemHotplugState *state)
> > > +{
> > > +    QemuOpts *opts = qemu_opts_find(qemu_find_opts("memory-opts"), NULL);
> > > +    g_assert(opts);
> > > +
> > > +    state->dev_count = qemu_opt_get_number(opts, "slots", 0);
> > > +
> > > +    if (!state->dev_count) {
> > > +        return;
> > > +    }
> > > +
> > > +    state->devs = g_malloc0(sizeof(*state->devs) * state->dev_count);
> > > +    memory_region_init_io(io, owner, &acpi_memory_hotplug_ops, state,
> > > +                          "apci-mem-hotplug", ACPI_MEMORY_HOTPLUG_IO_LEN);
> > > +}
> > > +
> > > +int acpi_memory_hotplug_cb(ACPIREGS *regs, MemHotplugState *mem_st,
> > > +                           DeviceState *dev, HotplugState state)
> > > +{
> > > +    MemStatus *mdev;
> > > +    Error *local_err = NULL;
> > > +    int slot = object_property_get_int(OBJECT(dev), "slot", &local_err);
> > > +
> > > +    if (error_is_set(&local_err)) {
> > > +        qerror_report_err(local_err);
> > > +        error_free(local_err);
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (slot >= mem_st->dev_count) {
> > > +        char *dev_path = object_get_canonical_path(OBJECT(dev));
> > > +        qerror_report(ERROR_CLASS_GENERIC_ERROR, "acpi_memory_hotplug_cb: "
> > > +                      "device [%s] returned invalid memory slot[%d]",
> > > +                      dev_path, slot);
> > > +        g_free(dev_path);
> > > +        return -1;
> > > +    }
> > > +
> > > +    mdev = &mem_st->devs[slot];
> > > +    if (state == HOTPLUG_ENABLED) {
> > > +        mdev->dimm = dev;
> > > +        mdev->is_enabled = true;
> > > +        mdev->is_inserting = true;
> > > +    }
> > > +
> > > +    /* do ACPI magic */
> > > +    regs->gpe.sts[0] |= ACPI_MEMORY_HOTPLUG_STATUS;
> > > +    return 0;
> > > +}
> > > diff --git a/include/hw/acpi/acpi.h b/include/hw/acpi/acpi.h
> > > index c4ae7d7..e5717df 100644
> > > --- a/include/hw/acpi/acpi.h
> > > +++ b/include/hw/acpi/acpi.h
> > > @@ -168,6 +168,30 @@ uint32_t acpi_gpe_ioport_readb(ACPIREGS *ar, uint32_t addr);
> > >  
> > >  void acpi_update_sci(ACPIREGS *acpi_regs, qemu_irq irq, uint32_t gpe0_sts_mask);
> > >  
> > > +#define ACPI_MEMORY_HOTPLUG_IO_LEN 24
> > > +#define ACPI_MEMORY_HOTPLUG_BASE 0x0a00
> > > +
> > > +#define ACPI_MEMORY_HOTPLUG_STATUS 8
> > > +
> > > +typedef struct MemStatus {
> > > +    DeviceState *dimm;
> > > +    bool is_enabled;
> > > +    bool is_inserting;
> > > +    uint32_t ost_event;
> > > +    uint32_t ost_status;
> > > +} MemStatus;
> > > +
> > > +typedef struct MemHotplugState {
> > > +    uint32_t selector;
> > > +    uint32_t dev_count;
> > > +    MemStatus *devs;
> > > +} MemHotplugState;
> > > +
> > > +void acpi_memory_hotplug_init(Object *owner, MemoryRegion *io,
> > > +                              MemHotplugState *state);
> > > +
> > > +int acpi_memory_hotplug_cb(ACPIREGS *regs, MemHotplugState *mem_st,
> > > +                           DeviceState *dev, HotplugState state);
> > >  /* acpi.c */
> > >  extern int acpi_enabled;
> > >  extern char unsigned *acpi_tables;
> > > -- 
> > > 1.7.1
Igor Mammedov Nov. 22, 2013, 5:14 p.m. UTC | #4
On Thu, 21 Nov 2013 16:38:47 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Nov 21, 2013 at 03:21:37PM +0100, Igor Mammedov wrote:
> > On Thu, 21 Nov 2013 11:42:02 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
[...]
> > > > +      [0x14] highest memory hot-plug interface version supported by QEMU
> > > 
> > > So this can make guest fail gracefully but it appears that
> > > detecting guest version would be nicer?
> > > It would let us actually support old guests ...
> > 
> > my idea of how to it was,
> > guest writes its version into [0x14] register and reads QEMU version
> > from it back, if they do not match then than BIOS ignores GPE.3 event
> > effectively disabling hotplug on guest side.
> > I haven't thought about supporting multiple implementations in QEMU though.
> > Do we really want it?
> 
> I'm talking about old bios which does not read acpi from qemu.
> We want it to work even if it can't see hotplugged memory.
to sum up cases that were discussed on IRC.

 * migration from NEW to OLD machine leads us to state when we run new BIOS with
   ACPI tables supporting memory hotplug on OLD machine.
   In general should work since NEW machine has to be started without memory
   hotplug enabled, which leads to disabling related ASL handler in ACPI tables.
   Case where NEW machine with memory hotplug enabled migrating to OLD machine
   without memory hotplug is invalid since migration will fail early due to
   missing devices (DimmBus/DIMM devices)

 * running OLD BIOS on new machine with memory hotplug turned on at QEMU CLI
        qemu -m X,slots=Y,maxmem=Z -bios old-bios-image -M pc-1.8

   Problem here is that OLD BIOS doesn't know about 'etc/reserved-memory-end' fwcfg,
   so it can start mapping PCI BARs right after high memory (i.e. in memory hotplug
   reserved space)

   That's fine as far as there is no DIMM devices since access will fall through
   hotplug-mem container to PCI address space.

   But if DIMM device added at startup or runtime, its MemoryRegion will overshadow
   PCI BARs mapped at its range.

   If it were only hotadd case, then QEMU could start with enabled but not active
   memory hotplug and require BIOS to acknowledge (activate) hotlug so that adding
   DIMM devices wouldn't be possible without hotplug enabled BIOS. 
   That would guarantee empty hotplug-mem container for OLD BIOS since it can't
   activate hotplug and avoid conflict.

   But if there are cold-plugged DIMM devices on QEMU CLI, that wouldn't work
   because OLD BIOS has no idea about them (i.e. it reads above4gb memory CMOS value)
   and it is bound to map 64-bit BARs in invalid place.
 
   It would be nice to run OLD BIOS on new machine and it will even work IF memory
   hotplug is not enabled.
   But I'm not sure we should care about case when not compatible BIOS is running
   on machine with active memory hotplug.

> 
> > > 
> > > > +      [0x15] Memory device status fields
> > > > +          bits:
> > > > +              1: device is enabled and may be used by guest
> > > > +              2: device insert event, used by ACPI BIOS to distinguish
> > > > +                 device for which no device check event to OSPM was issued
> > > 
> > > what does the above mean?
> > After OSPM issued device check on selected device it clears this bit to mark event
> > as handled.
> > It allows to avoid keeping this state in ASL (as it's done for CPU hotplug, see CPON)
> 
> That's fine.
> 
> > > what if device is not present?
> > ASL will issue device check and clear bit, it might be a bug since _STA would report
> > not present but no eject event was issued.
> > 
> > Papering over it ASL could check present bit first and issue device check only if
> > it's present.
> 
> Is this a problem? If yes - that will still be racy won't it?
I thought about it some more and I don't see why it would be racy.
bit 0x15.1 is set when there is initialized DIMM device in slot so 0x15.2 couldn't
be set if 0x15.1 == 0.

May be following description would be better:

   2: device insert event, used by ACPI BIOS to distinguish
      device for which no device check event to OSPM was issued.
      Bit is valid only when 0x15.1 bit is set.

It's possible to remove is_inserting bit altogether and just send device check to
each present memory device. Device check will be NOP for memory devices that OSPM
processed.
That will work fine with small amount of slots but would increase load on guest
in case of 1000s devices. That's the main reason for dedicated is_inserting bit,
allowing to handle hotadd effectively.

      
> 
> 
> Also, should guest reset eject memory that we requested unplug for?
I'm not sure what you saying here in general.
In particular unplug is not part of this series.


[...]
> > > selected?
> > see "write access: [0x0-0x3]"
> 
> yes but you have a typo above
Ahh, blind me. Thanks, I'll fix it.

> 
> > > 
> > > How about we actually require guest to enable memory?
> > > 
> > > This way if we hotplug, but old guest does not enable
> > > and puts a PCI device there, it just works.
> > I've lost you here, could you elaborate pls?
> 
> 
> Assume qemu adds memory by hotplug.
> Is it immediately enabled?
> I suggest it's not enabled, and only enable after ACPI
> enables it (or after reboot?)
I guess answer is in "running OLD BIOS on new machine with memory hotplug turned on"
case above. i.e. Don't run not compatible BIOS with turned on memory hotplug.
I think simpler interface/work-flow is better than a more complicated one,
that is required to support not totally sane configuration that is bound to be broken anyway.

> 
> > > 
> > > > +
> > > > +Selecting memory device slot beyond present range has no effect on platform:
> > > > +   - not documented above write accesses to memory hot-plug registers
> > > > +     are ignored;
> > > > +   - not documented above read accesses to memory hot-plug registers return 0xFF
> > > 
> > > 00 would be cleaner (matches not enabled - no event)?
> > I'm following pattern, where reading from not present IO port returns 0xFF on hardware.
> > Fact that ASL reads 0xFF could be used as not supported indication.
> 
> But isn't this a valid pattern when all bits are set?
It's to the same extent as 0x0. I'll change it to 0x0 if you still think it's better.

[...]
Michael S. Tsirkin Nov. 24, 2013, 10:58 a.m. UTC | #5
On Fri, Nov 22, 2013 at 06:14:46PM +0100, Igor Mammedov wrote:
> On Thu, 21 Nov 2013 16:38:47 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Thu, Nov 21, 2013 at 03:21:37PM +0100, Igor Mammedov wrote:
> > > On Thu, 21 Nov 2013 11:42:02 +0200
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> [...]
> > > > > +      [0x14] highest memory hot-plug interface version supported by QEMU
> > > > 
> > > > So this can make guest fail gracefully but it appears that
> > > > detecting guest version would be nicer?
> > > > It would let us actually support old guests ...
> > > 
> > > my idea of how to it was,
> > > guest writes its version into [0x14] register and reads QEMU version
> > > from it back, if they do not match then than BIOS ignores GPE.3 event
> > > effectively disabling hotplug on guest side.
> > > I haven't thought about supporting multiple implementations in QEMU though.
> > > Do we really want it?
> > 
> > I'm talking about old bios which does not read acpi from qemu.
> > We want it to work even if it can't see hotplugged memory.
> to sum up cases that were discussed on IRC.
> 
>  * migration from NEW to OLD machine leads us to state when we run new BIOS with
>    ACPI tables supporting memory hotplug on OLD machine.
>    In general should work since NEW machine has to be started without memory
>    hotplug enabled, which leads to disabling related ASL handler in ACPI tables.
>    Case where NEW machine with memory hotplug enabled migrating to OLD machine
>    without memory hotplug is invalid since migration will fail early due to
>    missing devices (DimmBus/DIMM devices)
> 
>  * running OLD BIOS on new machine with memory hotplug turned on at QEMU CLI
>         qemu -m X,slots=Y,maxmem=Z -bios old-bios-image -M pc-1.8
> 
>    Problem here is that OLD BIOS doesn't know about 'etc/reserved-memory-end' fwcfg,
>    so it can start mapping PCI BARs right after high memory (i.e. in memory hotplug
>    reserved space)
> 
>    That's fine as far as there is no DIMM devices since access will fall through
>    hotplug-mem container to PCI address space.
> 
>    But if DIMM device added at startup or runtime, its MemoryRegion will overshadow
>    PCI BARs mapped at its range.
> 
>    If it were only hotadd case, then QEMU could start with enabled but not active
>    memory hotplug and require BIOS to acknowledge (activate) hotlug so that adding
>    DIMM devices wouldn't be possible without hotplug enabled BIOS. 
>    That would guarantee empty hotplug-mem container for OLD BIOS since it can't
>    activate hotplug and avoid conflict.
> 
>    But if there are cold-plugged DIMM devices on QEMU CLI, that wouldn't work
>    because OLD BIOS has no idea about them (i.e. it reads above4gb memory CMOS value)
>    and it is bound to map 64-bit BARs in invalid place.
>  
>    It would be nice to run OLD BIOS on new machine and it will even work IF memory
>    hotplug is not enabled.
>    But I'm not sure we should care about case when not compatible BIOS is running
>    on machine with active memory hotplug.

It's not a must, I agree. It's just a useful excercise that might
help us think of potential problems.

Here this made me think of the following question:
should not memory available on boot be listed in CMOS?
Let's assume new BIOS gets interrupt telling it there's new memory.
It clears the interrupt and then system is reset.

Will system after reset detect the new hotplugged memory?
How?

> > 
> > > > 
> > > > > +      [0x15] Memory device status fields
> > > > > +          bits:
> > > > > +              1: device is enabled and may be used by guest
> > > > > +              2: device insert event, used by ACPI BIOS to distinguish
> > > > > +                 device for which no device check event to OSPM was issued
> > > > 
> > > > what does the above mean?
> > > After OSPM issued device check on selected device it clears this bit to mark event
> > > as handled.
> > > It allows to avoid keeping this state in ASL (as it's done for CPU hotplug, see CPON)
> > 
> > That's fine.
> > 
> > > > what if device is not present?
> > > ASL will issue device check and clear bit, it might be a bug since _STA would report
> > > not present but no eject event was issued.
> > > 
> > > Papering over it ASL could check present bit first and issue device check only if
> > > it's present.
> > 
> > Is this a problem? If yes - that will still be racy won't it?
> I thought about it some more and I don't see why it would be racy.
> bit 0x15.1 is set when there is initialized DIMM device in slot so 0x15.2 couldn't
> be set if 0x15.1 == 0.
> 
> May be following description would be better:
> 
>    2: device insert event, used by ACPI BIOS to distinguish
>       device for which no device check event to OSPM was issued.
>       Bit is valid only when 0x15.1 bit is set.
> 
> It's possible to remove is_inserting bit altogether and just send device check to
> each present memory device. Device check will be NOP for memory devices that OSPM
> processed.
> That will work fine with small amount of slots but would increase load on guest
> in case of 1000s devices. That's the main reason for dedicated is_inserting bit,
> allowing to handle hotadd effectively.
> 
>       
> > 
> > 
> > Also, should guest reset eject memory that we requested unplug for?
> I'm not sure what you saying here in general.
> In particular unplug is not part of this series.
> 
> 
> [...]
> > > > selected?
> > > see "write access: [0x0-0x3]"
> > 
> > yes but you have a typo above
> Ahh, blind me. Thanks, I'll fix it.
> 
> > 
> > > > 
> > > > How about we actually require guest to enable memory?
> > > > 
> > > > This way if we hotplug, but old guest does not enable
> > > > and puts a PCI device there, it just works.
> > > I've lost you here, could you elaborate pls?
> > 
> > 
> > Assume qemu adds memory by hotplug.
> > Is it immediately enabled?
> > I suggest it's not enabled, and only enable after ACPI
> > enables it (or after reboot?)
> I guess answer is in "running OLD BIOS on new machine with memory hotplug turned on"
> case above. i.e. Don't run not compatible BIOS with turned on memory hotplug.
> I think simpler interface/work-flow is better than a more complicated one,
> that is required to support not totally sane configuration that is bound to be broken anyway.
> 
> > 
> > > > 
> > > > > +
> > > > > +Selecting memory device slot beyond present range has no effect on platform:
> > > > > +   - not documented above write accesses to memory hot-plug registers
> > > > > +     are ignored;
> > > > > +   - not documented above read accesses to memory hot-plug registers return 0xFF
> > > > 
> > > > 00 would be cleaner (matches not enabled - no event)?
> > > I'm following pattern, where reading from not present IO port returns 0xFF on hardware.
> > > Fact that ASL reads 0xFF could be used as not supported indication.
> > 
> > But isn't this a valid pattern when all bits are set?
> It's to the same extent as 0x0. I'll change it to 0x0 if you still think it's better.
> 
> [...]
Markus Armbruster Nov. 25, 2013, 7:27 a.m. UTC | #6
"Michael S. Tsirkin" <mst@redhat.com> writes:

[...]
> Here this made me think of the following question:
> should not memory available on boot be listed in CMOS?
> Let's assume new BIOS gets interrupt telling it there's new memory.
> It clears the interrupt and then system is reset.
>
> Will system after reset detect the new hotplugged memory?
> How?

Do physical systems with hot-pluggable memory exist?  How do they
behave?

[...]
Paolo Bonzini Nov. 25, 2013, 1:45 p.m. UTC | #7
Il 25/11/2013 08:27, Markus Armbruster ha scritto:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> [...]
>> Here this made me think of the following question:
>> should not memory available on boot be listed in CMOS?
>> Let's assume new BIOS gets interrupt telling it there's new memory.
>> It clears the interrupt and then system is reset.
>>
>> Will system after reset detect the new hotplugged memory?

Can it just scan all slots from MHPD._INI?

For example on reset all slots could be disabled by the
ACPIHotpluggableDimmBus (even coldplugged ones), and scanned + enabled
by ASL.

>> How?
> 
> Do physical systems with hot-pluggable memory exist?  How do they
> behave?

I guess they assume BIOS is updated in lockstep with the introduction of
the feature.

Paolo
Igor Mammedov Nov. 25, 2013, 1:56 p.m. UTC | #8
On Sun, 24 Nov 2013 12:58:57 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Fri, Nov 22, 2013 at 06:14:46PM +0100, Igor Mammedov wrote:
> > On Thu, 21 Nov 2013 16:38:47 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > 
> > > On Thu, Nov 21, 2013 at 03:21:37PM +0100, Igor Mammedov wrote:
> > > > On Thu, 21 Nov 2013 11:42:02 +0200
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > [...]
> > > > > > +      [0x14] highest memory hot-plug interface version supported by QEMU
> > > > > 
> > > > > So this can make guest fail gracefully but it appears that
> > > > > detecting guest version would be nicer?
> > > > > It would let us actually support old guests ...
> > > > 
> > > > my idea of how to it was,
> > > > guest writes its version into [0x14] register and reads QEMU version
> > > > from it back, if they do not match then than BIOS ignores GPE.3 event
> > > > effectively disabling hotplug on guest side.
> > > > I haven't thought about supporting multiple implementations in QEMU though.
> > > > Do we really want it?
> > > 
> > > I'm talking about old bios which does not read acpi from qemu.
> > > We want it to work even if it can't see hotplugged memory.
> > to sum up cases that were discussed on IRC.
> > 
> >  * migration from NEW to OLD machine leads us to state when we run new BIOS with
> >    ACPI tables supporting memory hotplug on OLD machine.
> >    In general should work since NEW machine has to be started without memory
> >    hotplug enabled, which leads to disabling related ASL handler in ACPI tables.
> >    Case where NEW machine with memory hotplug enabled migrating to OLD machine
> >    without memory hotplug is invalid since migration will fail early due to
> >    missing devices (DimmBus/DIMM devices)
> > 
> >  * running OLD BIOS on new machine with memory hotplug turned on at QEMU CLI
> >         qemu -m X,slots=Y,maxmem=Z -bios old-bios-image -M pc-1.8
> > 
> >    Problem here is that OLD BIOS doesn't know about 'etc/reserved-memory-end' fwcfg,
> >    so it can start mapping PCI BARs right after high memory (i.e. in memory hotplug
> >    reserved space)
> > 
> >    That's fine as far as there is no DIMM devices since access will fall through
> >    hotplug-mem container to PCI address space.
> > 
> >    But if DIMM device added at startup or runtime, its MemoryRegion will overshadow
> >    PCI BARs mapped at its range.
> > 
> >    If it were only hotadd case, then QEMU could start with enabled but not active
> >    memory hotplug and require BIOS to acknowledge (activate) hotlug so that adding
> >    DIMM devices wouldn't be possible without hotplug enabled BIOS. 
> >    That would guarantee empty hotplug-mem container for OLD BIOS since it can't
> >    activate hotplug and avoid conflict.
> > 
> >    But if there are cold-plugged DIMM devices on QEMU CLI, that wouldn't work
> >    because OLD BIOS has no idea about them (i.e. it reads above4gb memory CMOS value)
> >    and it is bound to map 64-bit BARs in invalid place.
> >  
> >    It would be nice to run OLD BIOS on new machine and it will even work IF memory
> >    hotplug is not enabled.
> >    But I'm not sure we should care about case when not compatible BIOS is running
> >    on machine with active memory hotplug.
> 
> It's not a must, I agree. It's just a useful excercise that might
> help us think of potential problems.
> 
> Here this made me think of the following question:
> should not memory available on boot be listed in CMOS?
> Let's assume new BIOS gets interrupt telling it there's new memory.
> It clears the interrupt and then system is reset.
> 
> Will system after reset detect the new hotplugged memory?
> How?

I don't have memory hotplug enabled real hardware and can only deduce what is
needed from guest behavior. So here is my observations:
 * windows: hotpluggable memory doesn't have to be in E820, OS sees
   and automatically onlines present hotpluggable memory.
 * linux: kernel enumerates present memory devices and sends events to udev.
   It's up to user space to provide udev scripts for onlining hotpluggable
   memory now.

So in general rebooting at any time shouldn't hurt, and guest would see
hotplugged memory. But linux kernel won't use it until udev onlines it.

I see adding present at boot hotpluggable memory to E820 as a nice additional
feature which is not must have but I'm looking to implementing it as follow-up,
along with moving generation of related smbios tables into QEMU as you have
done for ACPI tables.

> > > 
> > > > > 
> > > > > > +      [0x15] Memory device status fields
> > > > > > +          bits:
> > > > > > +              1: device is enabled and may be used by guest
> > > > > > +              2: device insert event, used by ACPI BIOS to distinguish
> > > > > > +                 device for which no device check event to OSPM was issued
> > > > > 
> > > > > what does the above mean?
> > > > After OSPM issued device check on selected device it clears this bit to mark event
> > > > as handled.
> > > > It allows to avoid keeping this state in ASL (as it's done for CPU hotplug, see CPON)
> > > 
> > > That's fine.
> > > 
> > > > > what if device is not present?
> > > > ASL will issue device check and clear bit, it might be a bug since _STA would report
> > > > not present but no eject event was issued.
> > > > 
> > > > Papering over it ASL could check present bit first and issue device check only if
> > > > it's present.
> > > 
> > > Is this a problem? If yes - that will still be racy won't it?
> > I thought about it some more and I don't see why it would be racy.
> > bit 0x15.1 is set when there is initialized DIMM device in slot so 0x15.2 couldn't
> > be set if 0x15.1 == 0.
> > 
> > May be following description would be better:
> > 
> >    2: device insert event, used by ACPI BIOS to distinguish
> >       device for which no device check event to OSPM was issued.
> >       Bit is valid only when 0x15.1 bit is set.
> > 
> > It's possible to remove is_inserting bit altogether and just send device check to
> > each present memory device. Device check will be NOP for memory devices that OSPM
> > processed.
> > That will work fine with small amount of slots but would increase load on guest
> > in case of 1000s devices. That's the main reason for dedicated is_inserting bit,
> > allowing to handle hotadd effectively.
> > 
> >       
> > > 
> > > 
> > > Also, should guest reset eject memory that we requested unplug for?
> > I'm not sure what you saying here in general.
> > In particular unplug is not part of this series.
> > 
> > 
> > [...]
> > > > > selected?
> > > > see "write access: [0x0-0x3]"
> > > 
> > > yes but you have a typo above
> > Ahh, blind me. Thanks, I'll fix it.
> > 
> > > 
> > > > > 
> > > > > How about we actually require guest to enable memory?
> > > > > 
> > > > > This way if we hotplug, but old guest does not enable
> > > > > and puts a PCI device there, it just works.
> > > > I've lost you here, could you elaborate pls?
> > > 
> > > 
> > > Assume qemu adds memory by hotplug.
> > > Is it immediately enabled?
> > > I suggest it's not enabled, and only enable after ACPI
> > > enables it (or after reboot?)
> > I guess answer is in "running OLD BIOS on new machine with memory hotplug turned on"
> > case above. i.e. Don't run not compatible BIOS with turned on memory hotplug.
> > I think simpler interface/work-flow is better than a more complicated one,
> > that is required to support not totally sane configuration that is bound to be broken anyway.
> > 
> > > 
> > > > > 
> > > > > > +
> > > > > > +Selecting memory device slot beyond present range has no effect on platform:
> > > > > > +   - not documented above write accesses to memory hot-plug registers
> > > > > > +     are ignored;
> > > > > > +   - not documented above read accesses to memory hot-plug registers return 0xFF
> > > > > 
> > > > > 00 would be cleaner (matches not enabled - no event)?
> > > > I'm following pattern, where reading from not present IO port returns 0xFF on hardware.
> > > > Fact that ASL reads 0xFF could be used as not supported indication.
> > > 
> > > But isn't this a valid pattern when all bits are set?
> > It's to the same extent as 0x0. I'll change it to 0x0 if you still think it's better.
> > 
> > [...]
Igor Mammedov Nov. 25, 2013, 2:18 p.m. UTC | #9
On Mon, 25 Nov 2013 14:45:15 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 25/11/2013 08:27, Markus Armbruster ha scritto:
> > "Michael S. Tsirkin" <mst@redhat.com> writes:
> > 
> > [...]
> >> Here this made me think of the following question:
> >> should not memory available on boot be listed in CMOS?
> >> Let's assume new BIOS gets interrupt telling it there's new memory.
> >> It clears the interrupt and then system is reset.
> >>
> >> Will system after reset detect the new hotplugged memory?
> 
> Can it just scan all slots from MHPD._INI?
> 
> For example on reset all slots could be disabled by the
> ACPIHotpluggableDimmBus (even coldplugged ones), and scanned + enabled
> by ASL.
memory devices are scanned by OSPM during ACPI tables parsing and 
guest (can) use present devices right away.

Currently I don't see necessity for enabling individual memory devices from
ASL since it adds one more round trip for QEMU<->guest protocol for no
particular benefit. The only use case Michael suggested would be running
OLD BIOS (without memory hotplug support) on machine with actively used
memory hotplug (if OLD BIOS runs on machine with memory hotplug but there is
no hotplugged memory devices, it runs just fine), so case looks like not worth
an effort and complexity it would bring.

I believe having completely functional DIMM device after its realize() method
is completed is more strait-forward and easier to understand/support than
having guest to complete it's initialization.
That would allow later to put present at boot DIMMs into E820 (way before ASL
is executed) if that would be necessary and also help to convert initial
memory into hotpluggable one as well.

> 
> >> How?
> > 
> > Do physical systems with hot-pluggable memory exist?  How do they
> > behave?
> 
> I guess they assume BIOS is updated in lockstep with the introduction of
> the feature.
> 
> Paolo
>
Paolo Bonzini Nov. 25, 2013, 2:31 p.m. UTC | #10
Il 25/11/2013 15:18, Igor Mammedov ha scritto:
>> > For example on reset all slots could be disabled by the
>> > ACPIHotpluggableDimmBus (even coldplugged ones), and scanned + enabled
>> > by ASL.
> memory devices are scanned by OSPM during ACPI tables parsing and 
> guest (can) use present devices right away.

Yes, what I'm saying is scan them to enable them so that OSPM sees them
already as online.

> Currently I don't see necessity for enabling individual memory devices from
> ASL since it adds one more round trip for QEMU<->guest protocol for no
> particular benefit. The only use case Michael suggested would be running
> OLD BIOS (without memory hotplug support) on machine with actively used
> memory hotplug (if OLD BIOS runs on machine with memory hotplug but there is
> no hotplugged memory devices, it runs just fine), so case looks like not worth
     ^^^^^^^^^^

*Hotpluggable*, not hotplugged.  It would also break just with
coldplugged memory devices.

> That would allow later to put present at boot DIMMs into E820 (way before ASL
> is executed) if that would be necessary and also help to convert initial
> memory into hotpluggable one as well.

I am afraid that mixing hot-unplug with E820 could cause some
unanticipated trouble.  OSes may not expect memory in the E820 ranges to
disappear.

But since we're speaking about e820, do guests work if you make the
hotpluggable memory region reserved in the e820?  That should work with
old BIOS too using QEMU_CFG_E820_TABLE.  If that works, I see no need
for an "enabled" bit either.

Paolo

> an effort and complexity it would bring.
Igor Mammedov Nov. 25, 2013, 2:57 p.m. UTC | #11
On Mon, 25 Nov 2013 15:31:22 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 25/11/2013 15:18, Igor Mammedov ha scritto:
> >> > For example on reset all slots could be disabled by the
> >> > ACPIHotpluggableDimmBus (even coldplugged ones), and scanned + enabled
> >> > by ASL.
> > memory devices are scanned by OSPM during ACPI tables parsing and 
> > guest (can) use present devices right away.
> 
> Yes, what I'm saying is scan them to enable them so that OSPM sees them
> already as online.
> 
> > Currently I don't see necessity for enabling individual memory devices from
> > ASL since it adds one more round trip for QEMU<->guest protocol for no
> > particular benefit. The only use case Michael suggested would be running
> > OLD BIOS (without memory hotplug support) on machine with actively used
> > memory hotplug (if OLD BIOS runs on machine with memory hotplug but there is
> > no hotplugged memory devices, it runs just fine), so case looks like not worth
>      ^^^^^^^^^^
> 
> *Hotpluggable*, not hotplugged.  It would also break just with
> coldplugged memory devices.
> 
> > That would allow later to put present at boot DIMMs into E820 (way before ASL
> > is executed) if that would be necessary and also help to convert initial
> > memory into hotpluggable one as well.
> 
> I am afraid that mixing hot-unplug with E820 could cause some
> unanticipated trouble.  OSes may not expect memory in the E820 ranges to
> disappear.
> 
> But since we're speaking about e820, do guests work if you make the
> hotpluggable memory region reserved in the e820?  That should work with
> old BIOS too using QEMU_CFG_E820_TABLE.  If that works, I see no need
> for an "enabled" bit either.
It's on my to do list to make more code reading/experiments/testing to see if
that is ok with guest. I can't say it right now, that's why E820 changes is
not part of this series.

As for OLD BIOS it should be aware about not continuous memory ranges and
calculate present ram below/above4g via E820, there was Gerd's patch
to introduce new E820 fwcfg from QEMU and I hope to use/extend it for
telling BIOS about present coldplugged DIMMs. But I guess it's not for
OLD BIOS anyway, since it OLD SeaBIOS uses CMOS for getting ram sizes.

> 
> Paolo
> 
> > an effort and complexity it would bring.
>
Eric Blake Nov. 27, 2013, 5:59 p.m. UTC | #12
On 11/20/2013 07:38 PM, Igor Mammedov wrote:
> - implements QEMU hardware part of memory hotplug protocol
>   described at "docs/specs/acpi_mem_hotplug.txt"
> - handles only memory add notification event for now
> 
> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---

> +0xa00:
> +  read access:
> +      [0x0-0x3] Lo part of memory device phys address
> +      [0x4-0x7] Hi part of memory device phys address
> +      [0x8-0xb] Lo part of memory device size in bytes
> +      [0xc-0xf] Hi part of memory device size in bytes
> +      [0x14] highest memory hot-plug interface version supported by QEMU

Nothing about [0x10-0x13]? [Ah, I see you mention it later]

> +      [0x15] Memory device status fields
> +          bits:
> +              1: device is enabled and may be used by guest
> +              2: device insert event, used by ACPI BIOS to distinguish
> +                 device for which no device check event to OSPM was issued

Why start at bit 1 instead of bit 0?  Also, should you document that
remaining bits (2-7) should be ignored on read?

> +      [0x16-0x17] reserved
> +
> +  write access:
> +      [0x0-0x3] Memory device slot selector, selects active memory device.
> +                All following accesses to other registers in 0xaf80-0xaf97
> +                region will read/store data from/to selected memory device.
> +      [0x4-0x7] OST event code reported by OSPM
> +      [0x8-0xb] OST status code reported by OSPM
> +      [0x15] Memory device status fields

Nothing about behavior of [0xc-0x14]?  Is it an error to write there, or
are the writes just ignored, or...?

> +          bits:
> +              2: if set to 1 clears device insert event, set by ACPI BIOS
> +                 after it has sent device check event to OSPM for
> +                 seleted memory device

s/seleted/selected/

What must be written into the other bits?  Must reserved bits be written
as 0, or the user do a read-modify-write, or...?

> +
> +Selecting memory device slot beyond present range has no effect on platform:
> +   - not documented above write accesses to memory hot-plug registers
> +     are ignored;

Reads awkwardly.  Better is:

write accesses to memory hot-plug registers not documented above are ignored

> +   - not documented above read accesses to memory hot-plug registers return 0xFF

Similar awkward wording.

> +    case 0xc: /* Hi part of DIMM size */
> +        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL) >> 32;
> +        break;
> +    case 0x10: /* node proximity for _PXM method */
> +        val = object_property_get_int(OBJECT(mdev->dimm), "node", NULL);
> +        break;

This wasn't documented.

> +    case 0x14: /* intf version */
> +        val = 1;
> +        break;
> +    case 0x15: /* pack and return is_* fields */
> +        val |= mdev->is_enabled   ? 1 : 0;
> +        val |= mdev->is_inserting ? 2 : 0;

This is bit 0 and 1, not bit 1 and 2.
diff mbox

Patch

diff --git a/docs/specs/acpi_mem_hotplug.txt b/docs/specs/acpi_mem_hotplug.txt
new file mode 100644
index 0000000..fc34142
--- /dev/null
+++ b/docs/specs/acpi_mem_hotplug.txt
@@ -0,0 +1,38 @@ 
+QEMU<->ACPI BIOS memory hotplug interface
+--------------------------------------
+
+ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add
+or hot-remove events.
+
+Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access):
+---------------------------------------------------------------
+0xa00:
+  read access:
+      [0x0-0x3] Lo part of memory device phys address
+      [0x4-0x7] Hi part of memory device phys address
+      [0x8-0xb] Lo part of memory device size in bytes
+      [0xc-0xf] Hi part of memory device size in bytes
+      [0x14] highest memory hot-plug interface version supported by QEMU
+      [0x15] Memory device status fields
+          bits:
+              1: device is enabled and may be used by guest
+              2: device insert event, used by ACPI BIOS to distinguish
+                 device for which no device check event to OSPM was issued
+      [0x16-0x17] reserved
+
+  write access:
+      [0x0-0x3] Memory device slot selector, selects active memory device.
+                All following accesses to other registers in 0xaf80-0xaf97
+                region will read/store data from/to selected memory device.
+      [0x4-0x7] OST event code reported by OSPM
+      [0x8-0xb] OST status code reported by OSPM
+      [0x15] Memory device status fields
+          bits:
+              2: if set to 1 clears device insert event, set by ACPI BIOS
+                 after it has sent device check event to OSPM for
+                 seleted memory device
+
+Selecting memory device slot beyond present range has no effect on platform:
+   - not documented above write accesses to memory hot-plug registers
+     are ignored;
+   - not documented above read accesses to memory hot-plug registers return 0xFF
diff --git a/hw/acpi/core.c b/hw/acpi/core.c
index 8c0d48c..18e169c 100644
--- a/hw/acpi/core.c
+++ b/hw/acpi/core.c
@@ -680,3 +680,147 @@  void acpi_update_sci(ACPIREGS *regs, qemu_irq irq, uint32_t gpe0_sts_mask)
                        (regs->pm1.evt.en & ACPI_BITMASK_TIMER_ENABLE) &&
                        !(pm1a_sts & ACPI_BITMASK_TIMER_STATUS));
 }
+
+static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr,
+                                         unsigned int size)
+{
+    uint32_t val = 0;
+    MemHotplugState *mem_st = opaque;
+    MemStatus *mdev;
+
+    if (mem_st->selector >= mem_st->dev_count) {
+        return 0;
+    }
+
+    mdev = &mem_st->devs[mem_st->selector];
+    switch (addr) {
+    case 0x0: /* Lo part of phys address where DIMM is mapped */
+        val = object_property_get_int(OBJECT(mdev->dimm), "start", NULL);
+        break;
+    case 0x4: /* Hi part of phys address where DIMM is mapped */
+        val = object_property_get_int(OBJECT(mdev->dimm), "start", NULL) >> 32;
+        break;
+    case 0x8: /* Lo part of DIMM size */
+        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL);
+        break;
+    case 0xc: /* Hi part of DIMM size */
+        val = object_property_get_int(OBJECT(mdev->dimm), "size", NULL) >> 32;
+        break;
+    case 0x10: /* node proximity for _PXM method */
+        val = object_property_get_int(OBJECT(mdev->dimm), "node", NULL);
+        break;
+    case 0x14: /* intf version */
+        val = 1;
+        break;
+    case 0x15: /* pack and return is_* fields */
+        val |= mdev->is_enabled   ? 1 : 0;
+        val |= mdev->is_inserting ? 2 : 0;
+        break;
+    }
+    return val;
+}
+
+static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data,
+                                      unsigned int size)
+{
+    MemHotplugState *mem_st = opaque;
+    MemStatus *mdev;
+
+    if (!mem_st->dev_count) {
+        return;
+    }
+
+    if (addr) {
+        if (mem_st->selector >= mem_st->dev_count) {
+            return;
+        }
+    }
+
+    switch (addr) {
+    case 0x0: /* DIMM slot selector */
+        mem_st->selector = data;
+        break;
+    case 0x4: /* _OST event  */
+        mdev = &mem_st->devs[mem_st->selector];
+        if (data == 1) {
+            /* TODO: handle device insert OST event */
+        } else if (data == 3) {
+            /* TODO: handle device remove OST event */
+        }
+        mdev->ost_event = data;
+        break;
+    case 0x8: /* _OST status */
+        mdev = &mem_st->devs[mem_st->selector];
+        mdev->ost_status = data;
+        /* TODO: report async error */
+        /* TODO: implement memory removal on guest signal */
+        break;
+    case 0x15:
+        mdev = &mem_st->devs[mem_st->selector];
+        if (data & 2) { /* clear insert event */
+            mdev->is_inserting  = false;
+        }
+        break;
+    }
+
+}
+static const MemoryRegionOps acpi_memory_hotplug_ops = {
+    .read = acpi_memory_hotplug_read,
+    .write = acpi_memory_hotplug_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 4,
+    },
+};
+
+void acpi_memory_hotplug_init(Object *owner, MemoryRegion *io,
+                              MemHotplugState *state)
+{
+    QemuOpts *opts = qemu_opts_find(qemu_find_opts("memory-opts"), NULL);
+    g_assert(opts);
+
+    state->dev_count = qemu_opt_get_number(opts, "slots", 0);
+
+    if (!state->dev_count) {
+        return;
+    }
+
+    state->devs = g_malloc0(sizeof(*state->devs) * state->dev_count);
+    memory_region_init_io(io, owner, &acpi_memory_hotplug_ops, state,
+                          "apci-mem-hotplug", ACPI_MEMORY_HOTPLUG_IO_LEN);
+}
+
+int acpi_memory_hotplug_cb(ACPIREGS *regs, MemHotplugState *mem_st,
+                           DeviceState *dev, HotplugState state)
+{
+    MemStatus *mdev;
+    Error *local_err = NULL;
+    int slot = object_property_get_int(OBJECT(dev), "slot", &local_err);
+
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        return -1;
+    }
+
+    if (slot >= mem_st->dev_count) {
+        char *dev_path = object_get_canonical_path(OBJECT(dev));
+        qerror_report(ERROR_CLASS_GENERIC_ERROR, "acpi_memory_hotplug_cb: "
+                      "device [%s] returned invalid memory slot[%d]",
+                      dev_path, slot);
+        g_free(dev_path);
+        return -1;
+    }
+
+    mdev = &mem_st->devs[slot];
+    if (state == HOTPLUG_ENABLED) {
+        mdev->dimm = dev;
+        mdev->is_enabled = true;
+        mdev->is_inserting = true;
+    }
+
+    /* do ACPI magic */
+    regs->gpe.sts[0] |= ACPI_MEMORY_HOTPLUG_STATUS;
+    return 0;
+}
diff --git a/include/hw/acpi/acpi.h b/include/hw/acpi/acpi.h
index c4ae7d7..e5717df 100644
--- a/include/hw/acpi/acpi.h
+++ b/include/hw/acpi/acpi.h
@@ -168,6 +168,30 @@  uint32_t acpi_gpe_ioport_readb(ACPIREGS *ar, uint32_t addr);
 
 void acpi_update_sci(ACPIREGS *acpi_regs, qemu_irq irq, uint32_t gpe0_sts_mask);
 
+#define ACPI_MEMORY_HOTPLUG_IO_LEN 24
+#define ACPI_MEMORY_HOTPLUG_BASE 0x0a00
+
+#define ACPI_MEMORY_HOTPLUG_STATUS 8
+
+typedef struct MemStatus {
+    DeviceState *dimm;
+    bool is_enabled;
+    bool is_inserting;
+    uint32_t ost_event;
+    uint32_t ost_status;
+} MemStatus;
+
+typedef struct MemHotplugState {
+    uint32_t selector;
+    uint32_t dev_count;
+    MemStatus *devs;
+} MemHotplugState;
+
+void acpi_memory_hotplug_init(Object *owner, MemoryRegion *io,
+                              MemHotplugState *state);
+
+int acpi_memory_hotplug_cb(ACPIREGS *regs, MemHotplugState *mem_st,
+                           DeviceState *dev, HotplugState state);
 /* acpi.c */
 extern int acpi_enabled;
 extern char unsigned *acpi_tables;