Patchwork [RFC,v3,05/19] Implement dimm device abstraction

login
register
mail settings
Submitter Vasilis Liaskovitis
Date Sept. 21, 2012, 11:17 a.m.
Message ID <1348226255-4226-6-git-send-email-vasilis.liaskovitis@profitbricks.com>
Download mbox | patch
Permalink /patch/185712/
State New
Headers show

Comments

Vasilis Liaskovitis - Sept. 21, 2012, 11:17 a.m.
Each hotplug-able memory slot is a DimmDevice. All DimmDevices are attached
to a new bus called DimmBus. This bus is introduced so that we no longer
depend on hotplug-capability of main system bus (the main bus does not allow
hotplugging). The DimmBus should be attached to a chipset Device (i440fx in case
of the pc)

A hot-add operation for a particular dimm:
- creates a new DimmDevice and attaches it to the DimmBus
- creates a new MemoryRegion of the given physical address offset, size and
node proximity, and attaches it to main system memory as a sub_region.

A successful hot-remove operation detaches and frees the MemoryRegion from
system memory, and removes the DimmDevice from the DimmBus.

Hotplug operations are done through normal device_add /device_del commands.
Also add properties to DimmDevice.

Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com>
---
 hw/dimm.c |  305 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/dimm.h |   90 ++++++++++++++++++
 2 files changed, 395 insertions(+), 0 deletions(-)
 create mode 100644 hw/dimm.c
 create mode 100644 hw/dimm.h
Wen Congyang - Sept. 24, 2012, 6:02 a.m.
At 09/21/2012 07:17 PM, Vasilis Liaskovitis Wrote:
> Each hotplug-able memory slot is a DimmDevice. All DimmDevices are attached
> to a new bus called DimmBus. This bus is introduced so that we no longer
> depend on hotplug-capability of main system bus (the main bus does not allow
> hotplugging). The DimmBus should be attached to a chipset Device (i440fx in case
> of the pc)
> 
> A hot-add operation for a particular dimm:
> - creates a new DimmDevice and attaches it to the DimmBus
> - creates a new MemoryRegion of the given physical address offset, size and
> node proximity, and attaches it to main system memory as a sub_region.
> 
> A successful hot-remove operation detaches and frees the MemoryRegion from
> system memory, and removes the DimmDevice from the DimmBus.
> 
> Hotplug operations are done through normal device_add /device_del commands.
> Also add properties to DimmDevice.
> 
> Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com>
> ---
>  hw/dimm.c |  305 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/dimm.h |   90 ++++++++++++++++++
>  2 files changed, 395 insertions(+), 0 deletions(-)
>  create mode 100644 hw/dimm.c
>  create mode 100644 hw/dimm.h
> 
> diff --git a/hw/dimm.c b/hw/dimm.c
> new file mode 100644
> index 0000000..288b997
> --- /dev/null
> +++ b/hw/dimm.c
> @@ -0,0 +1,305 @@
> +/*
> + * Dimm device for Memory Hotplug
> + *
> + * Copyright ProfitBricks GmbH 2012
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>
> + */
> +
> +#include "trace.h"
> +#include "qdev.h"
> +#include "dimm.h"
> +#include <time.h>
> +#include "../exec-memory.h"
> +#include "qmp-commands.h"
> +
> +/* the system-wide memory bus. */
> +static DimmBus *main_memory_bus;
> +/* the following list is used to hold dimm config info before machine
> + * initialization. After machine init, the list is emptied and not used anymore.*/
> +static DimmConfiglist dimmconfig_list = QTAILQ_HEAD_INITIALIZER(dimmconfig_list);
> +
> +static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent);
> +static char *dimmbus_get_fw_dev_path(DeviceState *dev);
> +
> +static Property dimm_properties[] = {
> +    DEFINE_PROP_UINT64("start", DimmDevice, start, 0),
> +    DEFINE_PROP_UINT64("size", DimmDevice, size, DEFAULT_DIMMSIZE),
> +    DEFINE_PROP_UINT32("node", DimmDevice, node, 0),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent)
> +{
> +}
> +
> +static char *dimmbus_get_fw_dev_path(DeviceState *dev)
> +{
> +    char path[40];
> +
> +    snprintf(path, sizeof(path), "%s", qdev_fw_name(dev));
> +    return strdup(path);
> +}
> +
> +static void dimm_bus_class_init(ObjectClass *klass, void *data)
> +{
> +    BusClass *k = BUS_CLASS(klass);
> +
> +    k->print_dev = dimmbus_dev_print;
> +    k->get_fw_dev_path = dimmbus_get_fw_dev_path;
> +}
> +
> +static void dimm_bus_initfn(Object *obj)
> +{
> +    DimmConfig *dimm_cfg, *next_dimm_cfg;
> +    DimmBus *bus = DIMM_BUS(obj);
> +    QTAILQ_INIT(&bus->dimmconfig_list);
> +    QTAILQ_INIT(&bus->dimmlist);
> +
> +    QTAILQ_FOREACH_SAFE(dimm_cfg, &dimmconfig_list, nextdimmcfg, next_dimm_cfg) {
> +        QTAILQ_REMOVE(&dimmconfig_list, dimm_cfg, nextdimmcfg);
> +        QTAILQ_INSERT_TAIL(&bus->dimmconfig_list, dimm_cfg, nextdimmcfg);
> +    }
> +}
> +
> +static const TypeInfo dimm_bus_info = {
> +    .name = TYPE_DIMM_BUS,
> +    .parent = TYPE_BUS,
> +    .instance_size = sizeof(DimmBus),
> +    .instance_init = dimm_bus_initfn,
> +    .class_init = dimm_bus_class_init,
> +};
> +
> +void main_memory_bus_create(Object *parent)
> +{
> +    main_memory_bus = g_malloc0(dimm_bus_info.instance_size);
> +    main_memory_bus->qbus.glib_allocated = true;
> +    qbus_create_inplace(&main_memory_bus->qbus, TYPE_DIMM_BUS, DEVICE(parent),
> +                        "membus");
> +}
> +
> +static void dimm_populate(DimmDevice *s)
> +{
> +    DeviceState *dev= (DeviceState*)s;
> +    MemoryRegion *new = NULL;
> +
> +    new = g_malloc(sizeof(MemoryRegion));
> +    memory_region_init_ram(new, dev->id, s->size);
> +    vmstate_register_ram_global(new);
> +    memory_region_add_subregion(get_system_memory(), s->start, new);
> +    s->mr = new;
> +}
> +
> +static void dimm_depopulate(DimmDevice *s)
> +{
> +    assert(s);
> +    vmstate_unregister_ram(s->mr, NULL);
> +    memory_region_del_subregion(get_system_memory(), s->mr);
> +    memory_region_destroy(s->mr);
> +    s->mr = NULL;
> +}
> +
> +void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t
> +        dimm_idx, uint32_t populated)
> +{
> +    DimmConfig *dimm_cfg;
> +    dimm_cfg = (DimmConfig*) g_malloc0(sizeof(DimmConfig));
> +    dimm_cfg->name = id;
> +    dimm_cfg->idx = dimm_idx;
> +    dimm_cfg->start = 0;
> +    dimm_cfg->size = size;
> +    dimm_cfg->node = node;
> +    dimm_cfg->populated = populated;
> +
> +    QTAILQ_INSERT_TAIL(&dimmconfig_list, dimm_cfg, nextdimmcfg);
> +}
> +
> +void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev)
> +{
> +    DimmBus *bus = main_memory_bus;
> +    bus->qbus.allow_hotplug = 1;
> +    bus->dimm_hotplug_qdev = qdev;
> +    bus->dimm_hotplug = hotplug;
> +}
> +
> +static void dimm_plug_device(DimmDevice *slot)
> +{
> +    DimmBus *bus = main_memory_bus;
> +
> +    dimm_populate(slot);
> +    if (bus->dimm_hotplug)
> +        bus->dimm_hotplug(bus->dimm_hotplug_qdev, slot, 1);
> +}
> +
> +static int dimm_unplug_device(DeviceState *qdev)
> +{
> +    DimmBus *bus = main_memory_bus;
> +
> +    if (bus->dimm_hotplug)
> +        bus->dimm_hotplug(bus->dimm_hotplug_qdev, DIMM(qdev), 0);
> +    return 1;
> +}
> +
> +static DimmConfig *dimmcfg_find_from_name(const char *name)
> +{
> +    DimmConfig *slot;
> +    DimmBus *bus = main_memory_bus;
> +
> +    QTAILQ_FOREACH(slot, &bus->dimmconfig_list, nextdimmcfg) {
> +        if (!strcmp(slot->name, name)) {
> +            return slot;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +static DimmDevice *dimm_find_from_idx(uint32_t idx)
> +{
> +    DimmDevice *slot;
> +    DimmBus *bus = main_memory_bus;
> +
> +    QTAILQ_FOREACH(slot, &bus->dimmlist, nextdimm) {
> +        if (slot->idx == idx) {
> +            return slot;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +/* used to create a dimm device, only on incoming migration of a hotplugged
> + * RAMBlock
> + */
> +int dimm_add(char *id)
> +{
> +    DimmConfig *slotcfg = NULL;
> +    QemuOpts *devopts;
> +    char buf[256];
> +
> +    if (!id) {
> +        fprintf(stderr, "ERROR %s invalid id\n",__FUNCTION__);
> +        return 1;
> +    }
> +
> +    slotcfg = dimmcfg_find_from_name(id);
> +
> +    if (!slotcfg) {
> +        fprintf(stderr, "%s no slot %s found\n", __FUNCTION__, id);
> +        return 1;
> +    }
> +
> +    devopts = qemu_opts_create(qemu_find_opts("device"), id, 0, NULL);
> +    qemu_opt_set(devopts, "driver", "dimm");
> +
> +    snprintf(buf, sizeof(buf), "%lu", slotcfg->size);
> +    qemu_opt_set(devopts, "size", buf);
> +    snprintf(buf, sizeof(buf), "%u", slotcfg->node);
> +    qemu_opt_set(devopts, "node", buf);
> +    qdev_device_add(devopts);
> +
> +    return 0;
> +}
> +
> +/* used to calculate physical address offsets for all dimms */
> +void dimm_calc_offsets(dimm_calcoffset_fn calcfn)
> +{
> +    DimmConfig *slot;
> +    QTAILQ_FOREACH(slot, &dimmconfig_list, nextdimmcfg) {
> +        if (!slot->start) {
> +            slot->start = calcfn(slot->size);
> +        }
> +    }
> +}
> +
> +void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots)
> +{
> +    DimmConfig *slot;
> +
> +    QTAILQ_FOREACH(slot, &dimmconfig_list, nextdimmcfg) {
> +        assert(slot->start);
> +        fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start);
> +        fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size);
> +        fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node);
> +    }
> +}
> +
> +void dimm_notify(uint32_t idx, uint32_t event)
> +{
> +    DimmBus *bus = main_memory_bus;
> +    DimmDevice *s;
> +    s = dimm_find_from_idx(idx);
> +    assert(s != NULL);
> +
> +    switch(event) {
> +        case DIMM_REMOVE_SUCCESS:
> +            dimm_depopulate(s);
> +            qdev_simple_unplug_cb((DeviceState*)s);
> +            QTAILQ_REMOVE(&bus->dimmlist, s, nextdimm);
> +            break;
> +        default:
> +            break;
> +    }
> +}
> +
> +static int dimm_init(DeviceState *s)
> +{
> +    DimmBus *bus = main_memory_bus;
> +    DimmDevice *slot;
> +    DimmConfig *slotcfg;
> +
> +    slot = DIMM(s);
> +    slot->mr = NULL;
> +
> +    slotcfg = dimmcfg_find_from_name(s->id);
> +
> +    if (!slotcfg) {
> +        fprintf(stderr, "%s no config for slot %s found\n",
> +                __FUNCTION__, s->id);
> +        return 1;
> +    }
> +
> +    slot->idx = slotcfg->idx;
> +    assert(slotcfg->start);
> +    slot->start = slotcfg->start;
> +    slot->size = slotcfg->size;
> +    slot->node = slotcfg->node;
> +
> +    QTAILQ_INSERT_TAIL(&bus->dimmlist, slot, nextdimm);
> +    dimm_plug_device(slot);
> +
> +    return 0;
> +}
> +
> +
> +static void dimm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->props = dimm_properties;
> +    dc->unplug = dimm_unplug_device;
> +    dc->init = dimm_init;
> +}
> +
> +static TypeInfo dimm_info = {
> +    .name          = TYPE_DIMM,
> +    .parent        = TYPE_DEVICE,
> +    .instance_size = sizeof(DimmDevice),
> +    .class_init    = dimm_class_init,
> +};
> +
> +static void dimm_register_types(void)
> +{
> +    type_register_static(&dimm_bus_info);
> +    type_register_static(&dimm_info);
> +}
> +
> +type_init(dimm_register_types)
> diff --git a/hw/dimm.h b/hw/dimm.h
> new file mode 100644
> index 0000000..5e991a6
> --- /dev/null
> +++ b/hw/dimm.h
> @@ -0,0 +1,90 @@
> +#ifndef QEMU_DIMM_H
> +#define QEMU_DIMM_H
> +
> +#include "qemu-common.h"
> +#include "memory.h"
> +#include "sysbus.h"
> +#include "qapi-types.h"
> +#include "qemu-queue.h"
> +#include "cpus.h"
> +#define MAX_DIMMS 255
> +#define DIMM_BITMAP_BYTES (MAX_DIMMS + 7) / 8
> +#define DEFAULT_DIMMSIZE 1024*1024*1024
> +
> +typedef enum {
> +    DIMM_REMOVE_SUCCESS = 0,
> +    DIMM_REMOVE_FAIL = 1,
> +    DIMM_ADD_SUCCESS = 2,
> +    DIMM_ADD_FAIL = 3
> +} dimm_hp_result_code;
> +
> +#define TYPE_DIMM "dimm"
> +#define DIMM(obj) \
> +    OBJECT_CHECK(DimmDevice, (obj), TYPE_DIMM)
> +#define DIMM_CLASS(klass) \
> +    OBJECT_CLASS_CHECK(DimmDeviceClass, (klass), TYPE_DIMM)
> +#define DIMM_GET_CLASS(obj) \
> +    OBJECT_GET_CLASS(DimmDeviceClass, (obj), TYPE_DIMM)
> +
> +typedef struct DimmDevice DimmDevice;
> +typedef QTAILQ_HEAD(DimmConfiglist, DimmConfig) DimmConfiglist;
> +
> +typedef struct DimmDeviceClass {
> +    DeviceClass parent_class;
> +
> +    int (*init)(DimmDevice *dev);
> +} DimmDeviceClass;
> +
> +typedef struct DimmDevice {

typedef is unnecessay here, and it will break building:
  CC    hmp.o
In file included from /home/wency/source/qemu/hw/acpi_piix4.c:32:
/home/wency/source/qemu/hw/dimm.h:54: error: redefinition of typedef ‘DimmDevice’
/home/wency/source/qemu/hw/dimm.h:36: note: previous declaration of ‘DimmDevice’ was here
make[1]: *** [hw/acpi_piix4.o] Error 1
make[1]: *** Waiting for unfinished jobs....
  CC    audio/audio.o
make: *** [subdir-libhw64] Error 2
make: *** Waiting for unfinished jobs....

Thanks
Wen Congyang


> +    DeviceState qdev;
> +    uint32_t idx; /* index in memory hotplug register/bitmap */
> +    ram_addr_t start; /* starting physical address */
> +    ram_addr_t size;
> +    uint32_t node; /* numa node proximity */
> +    MemoryRegion *mr; /* MemoryRegion for this slot. !NULL only if populated */
> +    QTAILQ_ENTRY (DimmDevice) nextdimm;
> +} DimmDevice;
> +
> +typedef struct DimmConfig
> +{
> +    const char *name;
> +    uint32_t idx; /* index in memory hotplug register/bitmap */
> +    ram_addr_t start; /* starting physical address */
> +    ram_addr_t size;
> +    uint32_t node; /* numa node proximity */
> +    uint32_t populated; /* 1 means device has been hotplugged. Default is 0. */
> +    QTAILQ_ENTRY (DimmConfig) nextdimmcfg;
> +} DimmConfig;
> +
> +typedef int (*dimm_hotplug_fn)(DeviceState *qdev, DimmDevice *dev, int add);
> +typedef target_phys_addr_t (*dimm_calcoffset_fn)(uint64_t size);
> +
> +#define TYPE_DIMM_BUS "dimmbus"
> +#define DIMM_BUS(obj) OBJECT_CHECK(DimmBus, (obj), TYPE_DIMM_BUS)
> +
> +typedef struct DimmBus {
> +    BusState qbus;
> +    DeviceState *dimm_hotplug_qdev;
> +    dimm_hotplug_fn dimm_hotplug;
> +    dimm_calcoffset_fn dimm_calcoffset;
> +    DimmConfiglist dimmconfig_list;
> +    QTAILQ_HEAD(Dimmlist, DimmDevice) dimmlist;
> +} DimmBus;
> +
> +struct dimm_hp_result {
> +    const char *dimmname;
> +    dimm_hp_result_code ret;
> +    QTAILQ_ENTRY (dimm_hp_result) next;
> +};
> +
> +void dimm_calc_offsets(dimm_calcoffset_fn calcfn);
> +void dimm_notify(uint32_t idx, uint32_t event);
> +void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev);
> +void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots);
> +int dimm_add(char *id);
> +void main_memory_bus_create(Object *parent);
> +void dimm_config_create(char *id, uint64_t size, uint64_t node,
> +        uint32_t dimm_idx, uint32_t populated);
> +
> +
> +#endif
Stefan Hajnoczi - Oct. 23, 2012, 12:25 p.m.
On Fri, Sep 21, 2012 at 01:17:21PM +0200, Vasilis Liaskovitis wrote:
> +static void dimm_populate(DimmDevice *s)
> +{
> +    DeviceState *dev= (DeviceState*)s;
> +    MemoryRegion *new = NULL;
> +
> +    new = g_malloc(sizeof(MemoryRegion));
> +    memory_region_init_ram(new, dev->id, s->size);
> +    vmstate_register_ram_global(new);
> +    memory_region_add_subregion(get_system_memory(), s->start, new);
> +    s->mr = new;
> +}
> +
> +static void dimm_depopulate(DimmDevice *s)
> +{
> +    assert(s);
> +    vmstate_unregister_ram(s->mr, NULL);
> +    memory_region_del_subregion(get_system_memory(), s->mr);
> +    memory_region_destroy(s->mr);
> +    s->mr = NULL;
> +}

How is dimm hot unplug protected against callers who currently have RAM
mapped (from cpu_physical_memory_map())?

Emulated devices call cpu_physical_memory_map() directly or indirectly
through DMA emulation code.  The RAM pointer may be held for arbitrary
lengths of time, across main loop iterations, etc.

It's not clear to me that it is safe to unplug a DIMM that has network
or disk I/O buffers, for example.  We also need to be robust against
malicious guests who abuse the hotplug lifecycle.  QEMU should never be
left with dangling pointers.

Stefan
pingfan liu - Oct. 24, 2012, 8:06 a.m.
On Tue, Oct 23, 2012 at 8:25 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Sep 21, 2012 at 01:17:21PM +0200, Vasilis Liaskovitis wrote:
>> +static void dimm_populate(DimmDevice *s)
>> +{
>> +    DeviceState *dev= (DeviceState*)s;
>> +    MemoryRegion *new = NULL;
>> +
>> +    new = g_malloc(sizeof(MemoryRegion));
>> +    memory_region_init_ram(new, dev->id, s->size);
>> +    vmstate_register_ram_global(new);
>> +    memory_region_add_subregion(get_system_memory(), s->start, new);
>> +    s->mr = new;
>> +}
>> +
>> +static void dimm_depopulate(DimmDevice *s)
>> +{
>> +    assert(s);
>> +    vmstate_unregister_ram(s->mr, NULL);
>> +    memory_region_del_subregion(get_system_memory(), s->mr);
>> +    memory_region_destroy(s->mr);
>> +    s->mr = NULL;
>> +}
>
> How is dimm hot unplug protected against callers who currently have RAM
> mapped (from cpu_physical_memory_map())?
>
> Emulated devices call cpu_physical_memory_map() directly or indirectly
> through DMA emulation code.  The RAM pointer may be held for arbitrary
> lengths of time, across main loop iterations, etc.
>
> It's not clear to me that it is safe to unplug a DIMM that has network
> or disk I/O buffers, for example.  We also need to be robust against
> malicious guests who abuse the hotplug lifecycle.  QEMU should never be
> left with dangling pointers.
>
Not sure about the block layer. But I think those thread are already
out of big lock, so there should be a MemoryListener to catch the
RAM-unplug event, and if needed, bdrv_flush.

Regards,
pingfan
> Stefan
>
Stefan Hajnoczi - Oct. 24, 2012, 10:15 a.m.
On Wed, Oct 24, 2012 at 10:06 AM, liu ping fan <qemulist@gmail.com> wrote:
> On Tue, Oct 23, 2012 at 8:25 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Fri, Sep 21, 2012 at 01:17:21PM +0200, Vasilis Liaskovitis wrote:
>>> +static void dimm_populate(DimmDevice *s)
>>> +{
>>> +    DeviceState *dev= (DeviceState*)s;
>>> +    MemoryRegion *new = NULL;
>>> +
>>> +    new = g_malloc(sizeof(MemoryRegion));
>>> +    memory_region_init_ram(new, dev->id, s->size);
>>> +    vmstate_register_ram_global(new);
>>> +    memory_region_add_subregion(get_system_memory(), s->start, new);
>>> +    s->mr = new;
>>> +}
>>> +
>>> +static void dimm_depopulate(DimmDevice *s)
>>> +{
>>> +    assert(s);
>>> +    vmstate_unregister_ram(s->mr, NULL);
>>> +    memory_region_del_subregion(get_system_memory(), s->mr);
>>> +    memory_region_destroy(s->mr);
>>> +    s->mr = NULL;
>>> +}
>>
>> How is dimm hot unplug protected against callers who currently have RAM
>> mapped (from cpu_physical_memory_map())?
>>
>> Emulated devices call cpu_physical_memory_map() directly or indirectly
>> through DMA emulation code.  The RAM pointer may be held for arbitrary
>> lengths of time, across main loop iterations, etc.
>>
>> It's not clear to me that it is safe to unplug a DIMM that has network
>> or disk I/O buffers, for example.  We also need to be robust against
>> malicious guests who abuse the hotplug lifecycle.  QEMU should never be
>> left with dangling pointers.
>>
> Not sure about the block layer. But I think those thread are already
> out of big lock, so there should be a MemoryListener to catch the
> RAM-unplug event, and if needed, bdrv_flush.

Here is the detailed scenario:

1. Emulated device does cpu_physical_memory_map() and gets a pointer
to guest RAM.
2. Return to vcpu or iothread, continue processing...
3. Hot unplug of RAM causes the guest RAM to disappear.
4. Pending I/O completes and overwrites memory from dangling guest RAM pointer.

Any I/O device that does zero-copy I/O in QEMU faces this problem:
 * The block layer is affected.
 * The net layer is unaffected because it doesn't do zero-copy tx/rx
across returns to the main loop (#2 above).
 * Not sure about other devices classes (e.g. USB).

How should the MemoryListener callback work?  For block I/O it may not
be possible to cancel pending I/O asynchronously - if you try to
cancel then your thread may block until the I/O completes.
Synchronous cancel behavior is not workable since it can lead to poor
latency or hangs in the guest.

Stefan
Vasilis Liaskovitis - Oct. 24, 2012, 5:16 p.m.
Hi,

On Wed, Oct 24, 2012 at 12:15:17PM +0200, Stefan Hajnoczi wrote:
> On Wed, Oct 24, 2012 at 10:06 AM, liu ping fan <qemulist@gmail.com> wrote:
> > On Tue, Oct 23, 2012 at 8:25 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> >> On Fri, Sep 21, 2012 at 01:17:21PM +0200, Vasilis Liaskovitis wrote:
> >>> +static void dimm_populate(DimmDevice *s)
> >>> +{
> >>> +    DeviceState *dev= (DeviceState*)s;
> >>> +    MemoryRegion *new = NULL;
> >>> +
> >>> +    new = g_malloc(sizeof(MemoryRegion));
> >>> +    memory_region_init_ram(new, dev->id, s->size);
> >>> +    vmstate_register_ram_global(new);
> >>> +    memory_region_add_subregion(get_system_memory(), s->start, new);
> >>> +    s->mr = new;
> >>> +}
> >>> +
> >>> +static void dimm_depopulate(DimmDevice *s)
> >>> +{
> >>> +    assert(s);
> >>> +    vmstate_unregister_ram(s->mr, NULL);
> >>> +    memory_region_del_subregion(get_system_memory(), s->mr);
> >>> +    memory_region_destroy(s->mr);
> >>> +    s->mr = NULL;
> >>> +}
> >>
> >> How is dimm hot unplug protected against callers who currently have RAM
> >> mapped (from cpu_physical_memory_map())?
> >>
> >> Emulated devices call cpu_physical_memory_map() directly or indirectly
> >> through DMA emulation code.  The RAM pointer may be held for arbitrary
> >> lengths of time, across main loop iterations, etc.
> >>
> >> It's not clear to me that it is safe to unplug a DIMM that has network
> >> or disk I/O buffers, for example.  We also need to be robust against
> >> malicious guests who abuse the hotplug lifecycle.  QEMU should never be
> >> left with dangling pointers.
> >>
> > Not sure about the block layer. But I think those thread are already
> > out of big lock, so there should be a MemoryListener to catch the
> > RAM-unplug event, and if needed, bdrv_flush.

do we want bdrv_flush, or some kind of cancel request e.g. bdrv_aio_cancel?

> 
> Here is the detailed scenario:
> 
> 1. Emulated device does cpu_physical_memory_map() and gets a pointer
> to guest RAM.
> 2. Return to vcpu or iothread, continue processing...
> 3. Hot unplug of RAM causes the guest RAM to disappear.
> 4. Pending I/O completes and overwrites memory from dangling guest RAM pointer.
> 
> Any I/O device that does zero-copy I/O in QEMU faces this problem:
>  * The block layer is affected.
>  * The net layer is unaffected because it doesn't do zero-copy tx/rx
> across returns to the main loop (#2 above).
>  * Not sure about other devices classes (e.g. USB).
> 
> How should the MemoryListener callback work?  For block I/O it may not
> be possible to cancel pending I/O asynchronously - if you try to
> cancel then your thread may block until the I/O completes.

e.g. paio_cancel does this?
is there already an API to asynchronously cancel all in flight operations in a
BlockDriverState? Afaict block_job_cancel refers to streaming jobs only and
doesn't help here.

Can we make the RAM unplug initiate async I/O cancellations, prevent further I/Os,
and only free the memory in a callback, after all DMA I/O to the associated memory
region has been cancelled or completed?

Also iiuc the MemoryListener should be registered from users of
cpu_physical_memory_map e.g. hw/virtio.c

By the way dimm_depopulate only frees the qemu memory on an ACPI _EJ request, which
means that a well-behaved guest will have already offlined the memory and is not
using it anymore. If the guest still uses the memory e.g. for a DMA buffer, the
logical memory offlining will fail and the _EJ/qemu memory freeing will never
happen.

But in theory a malicious acpi guest driver could trigger _EJ requests to do step
3 above. 

Or perhaps the backing block driver can finish an I/O request for a zero-copy
block device that the guest doesn't care for anymore? I 'll think about this a
bit more.

> Synchronous cancel behavior is not workable since it can lead to poor
> latency or hangs in the guest.

ok

thanks,

- Vasilis
pingfan liu - Oct. 25, 2012, 8 a.m.
On Thu, Oct 25, 2012 at 1:16 AM, Vasilis Liaskovitis
<vasilis.liaskovitis@profitbricks.com> wrote:
> Hi,
>
> On Wed, Oct 24, 2012 at 12:15:17PM +0200, Stefan Hajnoczi wrote:
>> On Wed, Oct 24, 2012 at 10:06 AM, liu ping fan <qemulist@gmail.com> wrote:
>> > On Tue, Oct 23, 2012 at 8:25 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> >> On Fri, Sep 21, 2012 at 01:17:21PM +0200, Vasilis Liaskovitis wrote:
>> >>> +static void dimm_populate(DimmDevice *s)
>> >>> +{
>> >>> +    DeviceState *dev= (DeviceState*)s;
>> >>> +    MemoryRegion *new = NULL;
>> >>> +
>> >>> +    new = g_malloc(sizeof(MemoryRegion));
>> >>> +    memory_region_init_ram(new, dev->id, s->size);
>> >>> +    vmstate_register_ram_global(new);
>> >>> +    memory_region_add_subregion(get_system_memory(), s->start, new);
>> >>> +    s->mr = new;
>> >>> +}
>> >>> +
>> >>> +static void dimm_depopulate(DimmDevice *s)
>> >>> +{
>> >>> +    assert(s);
>> >>> +    vmstate_unregister_ram(s->mr, NULL);
>> >>> +    memory_region_del_subregion(get_system_memory(), s->mr);
>> >>> +    memory_region_destroy(s->mr);
>> >>> +    s->mr = NULL;
>> >>> +}
>> >>
>> >> How is dimm hot unplug protected against callers who currently have RAM
>> >> mapped (from cpu_physical_memory_map())?
>> >>
>> >> Emulated devices call cpu_physical_memory_map() directly or indirectly
>> >> through DMA emulation code.  The RAM pointer may be held for arbitrary
>> >> lengths of time, across main loop iterations, etc.
>> >>
>> >> It's not clear to me that it is safe to unplug a DIMM that has network
>> >> or disk I/O buffers, for example.  We also need to be robust against
>> >> malicious guests who abuse the hotplug lifecycle.  QEMU should never be
>> >> left with dangling pointers.
>> >>
>> > Not sure about the block layer. But I think those thread are already
>> > out of big lock, so there should be a MemoryListener to catch the
>> > RAM-unplug event, and if needed, bdrv_flush.
>
> do we want bdrv_flush, or some kind of cancel request e.g. bdrv_aio_cancel?
>
My original meaning is that flush out the dangling pointer.
>>
>> Here is the detailed scenario:
>>
>> 1. Emulated device does cpu_physical_memory_map() and gets a pointer
>> to guest RAM.
>> 2. Return to vcpu or iothread, continue processing...
>> 3. Hot unplug of RAM causes the guest RAM to disappear.
>> 4. Pending I/O completes and overwrites memory from dangling guest RAM pointer.
>>
>> Any I/O device that does zero-copy I/O in QEMU faces this problem:
>>  * The block layer is affected.
>>  * The net layer is unaffected because it doesn't do zero-copy tx/rx
>> across returns to the main loop (#2 above).
>>  * Not sure about other devices classes (e.g. USB).
>>
>> How should the MemoryListener callback work?  For block I/O it may not
>> be possible to cancel pending I/O asynchronously - if you try to
>> cancel then your thread may block until the I/O completes.
>
For current code, I think to block on the listener to wait for the
completion of flushing out. But after mr->ops's ref/unref patchset
accept, we can release the ref of RAM device after we have done with
it (it is a very raw idea, need to improve).

> e.g. paio_cancel does this?
> is there already an API to asynchronously cancel all in flight operations in a
> BlockDriverState? Afaict block_job_cancel refers to streaming jobs only and
> doesn't help here.
>
> Can we make the RAM unplug initiate async I/O cancellations, prevent further I/Os,
> and only free the memory in a callback, after all DMA I/O to the associated memory
> region has been cancelled or completed?
>
> Also iiuc the MemoryListener should be registered from users of
> cpu_physical_memory_map e.g. hw/virtio.c
>
Yes.
> By the way dimm_depopulate only frees the qemu memory on an ACPI _EJ request, which
> means that a well-behaved guest will have already offlined the memory and is not
> using it anymore. If the guest still uses the memory e.g. for a DMA buffer, the
> logical memory offlining will fail and the _EJ/qemu memory freeing will never
> happen.
>
Yes.
> But in theory a malicious acpi guest driver could trigger _EJ requests to do step
> 3 above.
>
> Or perhaps the backing block driver can finish an I/O request for a zero-copy
> block device that the guest doesn't care for anymore? I 'll think about this a
> bit more.
>
The guest is one of the users of  dimm device, and block layer is another one.

Regards,
pingfan
>> Synchronous cancel behavior is not workable since it can lead to poor
>> latency or hangs in the guest.
>
> ok
>
> thanks,
>
> - Vasilis
>
Avi Kivity - Oct. 31, 2012, 11:15 a.m.
On 10/24/2012 10:06 AM, liu ping fan wrote:
> On Tue, Oct 23, 2012 at 8:25 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Fri, Sep 21, 2012 at 01:17:21PM +0200, Vasilis Liaskovitis wrote:
>>> +static void dimm_populate(DimmDevice *s)
>>> +{
>>> +    DeviceState *dev= (DeviceState*)s;
>>> +    MemoryRegion *new = NULL;
>>> +
>>> +    new = g_malloc(sizeof(MemoryRegion));
>>> +    memory_region_init_ram(new, dev->id, s->size);
>>> +    vmstate_register_ram_global(new);
>>> +    memory_region_add_subregion(get_system_memory(), s->start, new);
>>> +    s->mr = new;
>>> +}
>>> +
>>> +static void dimm_depopulate(DimmDevice *s)
>>> +{
>>> +    assert(s);
>>> +    vmstate_unregister_ram(s->mr, NULL);
>>> +    memory_region_del_subregion(get_system_memory(), s->mr);
>>> +    memory_region_destroy(s->mr);
>>> +    s->mr = NULL;
>>> +}
>>
>> How is dimm hot unplug protected against callers who currently have RAM
>> mapped (from cpu_physical_memory_map())?
>>
>> Emulated devices call cpu_physical_memory_map() directly or indirectly
>> through DMA emulation code.  The RAM pointer may be held for arbitrary
>> lengths of time, across main loop iterations, etc.
>>
>> It's not clear to me that it is safe to unplug a DIMM that has network
>> or disk I/O buffers, for example.  We also need to be robust against
>> malicious guests who abuse the hotplug lifecycle.  QEMU should never be
>> left with dangling pointers.
>>
> Not sure about the block layer. But I think those thread are already
> out of big lock, so there should be a MemoryListener to catch the
> RAM-unplug event, and if needed, bdrv_flush.


IMO we should use the same mechanism as proposed for other devices:
address_space_map() should grab a reference on the dimm device, and
address_space_unmap() can release it.  This way device destruction will
be deferred as soon as all devices complete I/O.

We will have to be careful with network receive buffers though, since
they can be held indefinitely.
Stefan Hajnoczi - Oct. 31, 2012, 12:18 p.m.
On Wed, Oct 31, 2012 at 12:15 PM, Avi Kivity <avi@redhat.com> wrote:
> On 10/24/2012 10:06 AM, liu ping fan wrote:
>> On Tue, Oct 23, 2012 at 8:25 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>> On Fri, Sep 21, 2012 at 01:17:21PM +0200, Vasilis Liaskovitis wrote:
>>>> +static void dimm_populate(DimmDevice *s)
>>>> +{
>>>> +    DeviceState *dev= (DeviceState*)s;
>>>> +    MemoryRegion *new = NULL;
>>>> +
>>>> +    new = g_malloc(sizeof(MemoryRegion));
>>>> +    memory_region_init_ram(new, dev->id, s->size);
>>>> +    vmstate_register_ram_global(new);
>>>> +    memory_region_add_subregion(get_system_memory(), s->start, new);
>>>> +    s->mr = new;
>>>> +}
>>>> +
>>>> +static void dimm_depopulate(DimmDevice *s)
>>>> +{
>>>> +    assert(s);
>>>> +    vmstate_unregister_ram(s->mr, NULL);
>>>> +    memory_region_del_subregion(get_system_memory(), s->mr);
>>>> +    memory_region_destroy(s->mr);
>>>> +    s->mr = NULL;
>>>> +}
>>>
>>> How is dimm hot unplug protected against callers who currently have RAM
>>> mapped (from cpu_physical_memory_map())?
>>>
>>> Emulated devices call cpu_physical_memory_map() directly or indirectly
>>> through DMA emulation code.  The RAM pointer may be held for arbitrary
>>> lengths of time, across main loop iterations, etc.
>>>
>>> It's not clear to me that it is safe to unplug a DIMM that has network
>>> or disk I/O buffers, for example.  We also need to be robust against
>>> malicious guests who abuse the hotplug lifecycle.  QEMU should never be
>>> left with dangling pointers.
>>>
>> Not sure about the block layer. But I think those thread are already
>> out of big lock, so there should be a MemoryListener to catch the
>> RAM-unplug event, and if needed, bdrv_flush.
>
>
> IMO we should use the same mechanism as proposed for other devices:
> address_space_map() should grab a reference on the dimm device, and
> address_space_unmap() can release it.  This way device destruction will
> be deferred as soon as all devices complete I/O.
>
> We will have to be careful with network receive buffers though, since
> they can be held indefinitely.

Network receive buffers aren't mapped.  Net receive is not zero-copy.
For example, virtio-net does virtqueue_pop() inside
virtio_net_receive().

I don't see a problem with networking.

Stefan
Avi Kivity - Oct. 31, 2012, 12:34 p.m.
On 10/31/2012 02:18 PM, Stefan Hajnoczi wrote:
>>
>> IMO we should use the same mechanism as proposed for other devices:
>> address_space_map() should grab a reference on the dimm device, and
>> address_space_unmap() can release it.  This way device destruction will
>> be deferred as soon as all devices complete I/O.
>>
>> We will have to be careful with network receive buffers though, since
>> they can be held indefinitely.
> 
> Network receive buffers aren't mapped.  Net receive is not zero-copy.
> For example, virtio-net does virtqueue_pop() inside
> virtio_net_receive().
> 
> I don't see a problem with networking.

What about vhost-net?  But that is managed separately with a MemoryListener.
Stefan Hajnoczi - Oct. 31, 2012, 12:34 p.m.
On Wed, Oct 31, 2012 at 1:34 PM, Avi Kivity <avi@redhat.com> wrote:
> On 10/31/2012 02:18 PM, Stefan Hajnoczi wrote:
>>>
>>> IMO we should use the same mechanism as proposed for other devices:
>>> address_space_map() should grab a reference on the dimm device, and
>>> address_space_unmap() can release it.  This way device destruction will
>>> be deferred as soon as all devices complete I/O.
>>>
>>> We will have to be careful with network receive buffers though, since
>>> they can be held indefinitely.
>>
>> Network receive buffers aren't mapped.  Net receive is not zero-copy.
>> For example, virtio-net does virtqueue_pop() inside
>> virtio_net_receive().
>>
>> I don't see a problem with networking.
>
> What about vhost-net?  But that is managed separately with a MemoryListener.

Yep.  It should find out when memory regions change through its listener.

Stefan

Patch

diff --git a/hw/dimm.c b/hw/dimm.c
new file mode 100644
index 0000000..288b997
--- /dev/null
+++ b/hw/dimm.c
@@ -0,0 +1,305 @@ 
+/*
+ * Dimm device for Memory Hotplug
+ *
+ * Copyright ProfitBricks GmbH 2012
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#include "trace.h"
+#include "qdev.h"
+#include "dimm.h"
+#include <time.h>
+#include "../exec-memory.h"
+#include "qmp-commands.h"
+
+/* the system-wide memory bus. */
+static DimmBus *main_memory_bus;
+/* the following list is used to hold dimm config info before machine
+ * initialization. After machine init, the list is emptied and not used anymore.*/
+static DimmConfiglist dimmconfig_list = QTAILQ_HEAD_INITIALIZER(dimmconfig_list);
+
+static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent);
+static char *dimmbus_get_fw_dev_path(DeviceState *dev);
+
+static Property dimm_properties[] = {
+    DEFINE_PROP_UINT64("start", DimmDevice, start, 0),
+    DEFINE_PROP_UINT64("size", DimmDevice, size, DEFAULT_DIMMSIZE),
+    DEFINE_PROP_UINT32("node", DimmDevice, node, 0),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent)
+{
+}
+
+static char *dimmbus_get_fw_dev_path(DeviceState *dev)
+{
+    char path[40];
+
+    snprintf(path, sizeof(path), "%s", qdev_fw_name(dev));
+    return strdup(path);
+}
+
+static void dimm_bus_class_init(ObjectClass *klass, void *data)
+{
+    BusClass *k = BUS_CLASS(klass);
+
+    k->print_dev = dimmbus_dev_print;
+    k->get_fw_dev_path = dimmbus_get_fw_dev_path;
+}
+
+static void dimm_bus_initfn(Object *obj)
+{
+    DimmConfig *dimm_cfg, *next_dimm_cfg;
+    DimmBus *bus = DIMM_BUS(obj);
+    QTAILQ_INIT(&bus->dimmconfig_list);
+    QTAILQ_INIT(&bus->dimmlist);
+
+    QTAILQ_FOREACH_SAFE(dimm_cfg, &dimmconfig_list, nextdimmcfg, next_dimm_cfg) {
+        QTAILQ_REMOVE(&dimmconfig_list, dimm_cfg, nextdimmcfg);
+        QTAILQ_INSERT_TAIL(&bus->dimmconfig_list, dimm_cfg, nextdimmcfg);
+    }
+}
+
+static const TypeInfo dimm_bus_info = {
+    .name = TYPE_DIMM_BUS,
+    .parent = TYPE_BUS,
+    .instance_size = sizeof(DimmBus),
+    .instance_init = dimm_bus_initfn,
+    .class_init = dimm_bus_class_init,
+};
+
+void main_memory_bus_create(Object *parent)
+{
+    main_memory_bus = g_malloc0(dimm_bus_info.instance_size);
+    main_memory_bus->qbus.glib_allocated = true;
+    qbus_create_inplace(&main_memory_bus->qbus, TYPE_DIMM_BUS, DEVICE(parent),
+                        "membus");
+}
+
+static void dimm_populate(DimmDevice *s)
+{
+    DeviceState *dev= (DeviceState*)s;
+    MemoryRegion *new = NULL;
+
+    new = g_malloc(sizeof(MemoryRegion));
+    memory_region_init_ram(new, dev->id, s->size);
+    vmstate_register_ram_global(new);
+    memory_region_add_subregion(get_system_memory(), s->start, new);
+    s->mr = new;
+}
+
+static void dimm_depopulate(DimmDevice *s)
+{
+    assert(s);
+    vmstate_unregister_ram(s->mr, NULL);
+    memory_region_del_subregion(get_system_memory(), s->mr);
+    memory_region_destroy(s->mr);
+    s->mr = NULL;
+}
+
+void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t
+        dimm_idx, uint32_t populated)
+{
+    DimmConfig *dimm_cfg;
+    dimm_cfg = (DimmConfig*) g_malloc0(sizeof(DimmConfig));
+    dimm_cfg->name = id;
+    dimm_cfg->idx = dimm_idx;
+    dimm_cfg->start = 0;
+    dimm_cfg->size = size;
+    dimm_cfg->node = node;
+    dimm_cfg->populated = populated;
+
+    QTAILQ_INSERT_TAIL(&dimmconfig_list, dimm_cfg, nextdimmcfg);
+}
+
+void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev)
+{
+    DimmBus *bus = main_memory_bus;
+    bus->qbus.allow_hotplug = 1;
+    bus->dimm_hotplug_qdev = qdev;
+    bus->dimm_hotplug = hotplug;
+}
+
+static void dimm_plug_device(DimmDevice *slot)
+{
+    DimmBus *bus = main_memory_bus;
+
+    dimm_populate(slot);
+    if (bus->dimm_hotplug)
+        bus->dimm_hotplug(bus->dimm_hotplug_qdev, slot, 1);
+}
+
+static int dimm_unplug_device(DeviceState *qdev)
+{
+    DimmBus *bus = main_memory_bus;
+
+    if (bus->dimm_hotplug)
+        bus->dimm_hotplug(bus->dimm_hotplug_qdev, DIMM(qdev), 0);
+    return 1;
+}
+
+static DimmConfig *dimmcfg_find_from_name(const char *name)
+{
+    DimmConfig *slot;
+    DimmBus *bus = main_memory_bus;
+
+    QTAILQ_FOREACH(slot, &bus->dimmconfig_list, nextdimmcfg) {
+        if (!strcmp(slot->name, name)) {
+            return slot;
+        }
+    }
+    return NULL;
+}
+
+static DimmDevice *dimm_find_from_idx(uint32_t idx)
+{
+    DimmDevice *slot;
+    DimmBus *bus = main_memory_bus;
+
+    QTAILQ_FOREACH(slot, &bus->dimmlist, nextdimm) {
+        if (slot->idx == idx) {
+            return slot;
+        }
+    }
+    return NULL;
+}
+
+/* used to create a dimm device, only on incoming migration of a hotplugged
+ * RAMBlock
+ */
+int dimm_add(char *id)
+{
+    DimmConfig *slotcfg = NULL;
+    QemuOpts *devopts;
+    char buf[256];
+
+    if (!id) {
+        fprintf(stderr, "ERROR %s invalid id\n",__FUNCTION__);
+        return 1;
+    }
+
+    slotcfg = dimmcfg_find_from_name(id);
+
+    if (!slotcfg) {
+        fprintf(stderr, "%s no slot %s found\n", __FUNCTION__, id);
+        return 1;
+    }
+
+    devopts = qemu_opts_create(qemu_find_opts("device"), id, 0, NULL);
+    qemu_opt_set(devopts, "driver", "dimm");
+
+    snprintf(buf, sizeof(buf), "%lu", slotcfg->size);
+    qemu_opt_set(devopts, "size", buf);
+    snprintf(buf, sizeof(buf), "%u", slotcfg->node);
+    qemu_opt_set(devopts, "node", buf);
+    qdev_device_add(devopts);
+
+    return 0;
+}
+
+/* used to calculate physical address offsets for all dimms */
+void dimm_calc_offsets(dimm_calcoffset_fn calcfn)
+{
+    DimmConfig *slot;
+    QTAILQ_FOREACH(slot, &dimmconfig_list, nextdimmcfg) {
+        if (!slot->start) {
+            slot->start = calcfn(slot->size);
+        }
+    }
+}
+
+void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots)
+{
+    DimmConfig *slot;
+
+    QTAILQ_FOREACH(slot, &dimmconfig_list, nextdimmcfg) {
+        assert(slot->start);
+        fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start);
+        fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size);
+        fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node);
+    }
+}
+
+void dimm_notify(uint32_t idx, uint32_t event)
+{
+    DimmBus *bus = main_memory_bus;
+    DimmDevice *s;
+    s = dimm_find_from_idx(idx);
+    assert(s != NULL);
+
+    switch(event) {
+        case DIMM_REMOVE_SUCCESS:
+            dimm_depopulate(s);
+            qdev_simple_unplug_cb((DeviceState*)s);
+            QTAILQ_REMOVE(&bus->dimmlist, s, nextdimm);
+            break;
+        default:
+            break;
+    }
+}
+
+static int dimm_init(DeviceState *s)
+{
+    DimmBus *bus = main_memory_bus;
+    DimmDevice *slot;
+    DimmConfig *slotcfg;
+
+    slot = DIMM(s);
+    slot->mr = NULL;
+
+    slotcfg = dimmcfg_find_from_name(s->id);
+
+    if (!slotcfg) {
+        fprintf(stderr, "%s no config for slot %s found\n",
+                __FUNCTION__, s->id);
+        return 1;
+    }
+
+    slot->idx = slotcfg->idx;
+    assert(slotcfg->start);
+    slot->start = slotcfg->start;
+    slot->size = slotcfg->size;
+    slot->node = slotcfg->node;
+
+    QTAILQ_INSERT_TAIL(&bus->dimmlist, slot, nextdimm);
+    dimm_plug_device(slot);
+
+    return 0;
+}
+
+
+static void dimm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->props = dimm_properties;
+    dc->unplug = dimm_unplug_device;
+    dc->init = dimm_init;
+}
+
+static TypeInfo dimm_info = {
+    .name          = TYPE_DIMM,
+    .parent        = TYPE_DEVICE,
+    .instance_size = sizeof(DimmDevice),
+    .class_init    = dimm_class_init,
+};
+
+static void dimm_register_types(void)
+{
+    type_register_static(&dimm_bus_info);
+    type_register_static(&dimm_info);
+}
+
+type_init(dimm_register_types)
diff --git a/hw/dimm.h b/hw/dimm.h
new file mode 100644
index 0000000..5e991a6
--- /dev/null
+++ b/hw/dimm.h
@@ -0,0 +1,90 @@ 
+#ifndef QEMU_DIMM_H
+#define QEMU_DIMM_H
+
+#include "qemu-common.h"
+#include "memory.h"
+#include "sysbus.h"
+#include "qapi-types.h"
+#include "qemu-queue.h"
+#include "cpus.h"
+#define MAX_DIMMS 255
+#define DIMM_BITMAP_BYTES (MAX_DIMMS + 7) / 8
+#define DEFAULT_DIMMSIZE 1024*1024*1024
+
+typedef enum {
+    DIMM_REMOVE_SUCCESS = 0,
+    DIMM_REMOVE_FAIL = 1,
+    DIMM_ADD_SUCCESS = 2,
+    DIMM_ADD_FAIL = 3
+} dimm_hp_result_code;
+
+#define TYPE_DIMM "dimm"
+#define DIMM(obj) \
+    OBJECT_CHECK(DimmDevice, (obj), TYPE_DIMM)
+#define DIMM_CLASS(klass) \
+    OBJECT_CLASS_CHECK(DimmDeviceClass, (klass), TYPE_DIMM)
+#define DIMM_GET_CLASS(obj) \
+    OBJECT_GET_CLASS(DimmDeviceClass, (obj), TYPE_DIMM)
+
+typedef struct DimmDevice DimmDevice;
+typedef QTAILQ_HEAD(DimmConfiglist, DimmConfig) DimmConfiglist;
+
+typedef struct DimmDeviceClass {
+    DeviceClass parent_class;
+
+    int (*init)(DimmDevice *dev);
+} DimmDeviceClass;
+
+typedef struct DimmDevice {
+    DeviceState qdev;
+    uint32_t idx; /* index in memory hotplug register/bitmap */
+    ram_addr_t start; /* starting physical address */
+    ram_addr_t size;
+    uint32_t node; /* numa node proximity */
+    MemoryRegion *mr; /* MemoryRegion for this slot. !NULL only if populated */
+    QTAILQ_ENTRY (DimmDevice) nextdimm;
+} DimmDevice;
+
+typedef struct DimmConfig
+{
+    const char *name;
+    uint32_t idx; /* index in memory hotplug register/bitmap */
+    ram_addr_t start; /* starting physical address */
+    ram_addr_t size;
+    uint32_t node; /* numa node proximity */
+    uint32_t populated; /* 1 means device has been hotplugged. Default is 0. */
+    QTAILQ_ENTRY (DimmConfig) nextdimmcfg;
+} DimmConfig;
+
+typedef int (*dimm_hotplug_fn)(DeviceState *qdev, DimmDevice *dev, int add);
+typedef target_phys_addr_t (*dimm_calcoffset_fn)(uint64_t size);
+
+#define TYPE_DIMM_BUS "dimmbus"
+#define DIMM_BUS(obj) OBJECT_CHECK(DimmBus, (obj), TYPE_DIMM_BUS)
+
+typedef struct DimmBus {
+    BusState qbus;
+    DeviceState *dimm_hotplug_qdev;
+    dimm_hotplug_fn dimm_hotplug;
+    dimm_calcoffset_fn dimm_calcoffset;
+    DimmConfiglist dimmconfig_list;
+    QTAILQ_HEAD(Dimmlist, DimmDevice) dimmlist;
+} DimmBus;
+
+struct dimm_hp_result {
+    const char *dimmname;
+    dimm_hp_result_code ret;
+    QTAILQ_ENTRY (dimm_hp_result) next;
+};
+
+void dimm_calc_offsets(dimm_calcoffset_fn calcfn);
+void dimm_notify(uint32_t idx, uint32_t event);
+void dimm_bus_hotplug(dimm_hotplug_fn hotplug, DeviceState *qdev);
+void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots);
+int dimm_add(char *id);
+void main_memory_bus_create(Object *parent);
+void dimm_config_create(char *id, uint64_t size, uint64_t node,
+        uint32_t dimm_idx, uint32_t populated);
+
+
+#endif