[RFC,v2,01/20] Hierarchical memory region API

Message ID 1309180927-19003-2-git-send-email-avi@redhat.com
State New

Commit Message

Avi Kivity June 27, 2011, 1:21 p.m. UTC
The memory API separates the attributes of a memory region (its size, how
reads or writes are handled, dirty logging, and coalescing) from where it
is mapped and whether it is enabled.  This allows a device to configure
a memory region once, then hand it off to its parent bus to map it according
to the bus configuration.

Hierarchical registration also allows a device to compose a region out of
a number of sub-regions with different properties; for example some may be
RAM while others may be MMIO.
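
As a usage sketch (the FooState device and its callbacks below are
invented for illustration; only the memory_region_* calls are part of
this patch):

    /* A device composes a container region out of a RAM-backed buffer
     * and an MMIO register block, once, at init time; the parent bus
     * later maps the container wherever its configuration dictates.
     */
    typedef struct FooState {
        MemoryRegion container;
        MemoryRegion buf;
        MemoryRegion regs;
    } FooState;

    static uint64_t foo_read(MemoryRegion *mr, target_phys_addr_t addr,
                             unsigned size)
    {
        return 0; /* register reads elided */
    }

    static void foo_write(MemoryRegion *mr, target_phys_addr_t addr,
                          uint64_t data, unsigned size)
    {
        /* register writes elided */
    }

    static const MemoryRegionOps foo_ops = {
        .read = foo_read,
        .write = foo_write,
    };

    static void foo_init(FooState *s, DeviceState *dev)
    {
        memory_region_init(&s->container, "foo", 0x2000);
        memory_region_init_ram(&s->buf, dev, "foo.buf", 0x1000);
        memory_region_init_io(&s->regs, &foo_ops, "foo.regs", 0x1000);
        memory_region_add_subregion(&s->container, 0x0000, &s->buf);
        memory_region_add_subregion(&s->container, 0x1000, &s->regs);
        /* the parent bus later maps the whole device at a bus-assigned
         * base: memory_region_add_subregion(bus_mr, base, &s->container);
         */
    }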

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Makefile.target |    1 +
 memory.c        |  659 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 memory.h        |  201 +++++++++++++++++
 3 files changed, 861 insertions(+), 0 deletions(-)
 create mode 100644 memory.c
 create mode 100644 memory.h

Comments

Michael S. Tsirkin June 28, 2011, 10:03 a.m. UTC | #1
On Mon, Jun 27, 2011 at 04:21:48PM +0300, Avi Kivity wrote:

...

> +static bool memory_region_access_valid(MemoryRegion *mr,
> +                                       target_phys_addr_t addr,
> +                                       unsigned size)
> +{
> +    if (!mr->ops->valid.unaligned && (addr & (size - 1))) {
> +        return false;
> +    }
> +
> +    /* Treat zero (unset) as "all sizes valid", for compatibility */
> +    if (!mr->ops->valid.max_access_size) {
> +        return true;
> +    }
> +
> +    if (size > mr->ops->valid.max_access_size
> +        || size < mr->ops->valid.min_access_size) {
> +        return false;
> +    }
> +    return true;
> +}
> +
> +static uint32_t memory_region_read_thunk_n(void *_mr,
> +                                           target_phys_addr_t addr,
> +                                           unsigned size)
> +{
> +    MemoryRegion *mr = _mr;
> +    unsigned access_size, access_size_min, access_size_max;
> +    uint64_t access_mask;
> +    uint32_t data = 0, tmp;
> +    unsigned i;
> +
> +    if (!memory_region_access_valid(mr, addr, size)) {
> +        return -1U; /* FIXME: better signalling */
> +    }
> +
> +    /* FIXME: support unaligned access */
> +
> +    access_size_min = mr->ops->impl.max_access_size;

min = max: Intentional? Cut&paste error?

> +    if (!access_size_min) {
> +        access_size_min = 1;
> +    }
> +    access_size_max = mr->ops->impl.max_access_size;
> +    if (!access_size_max) {
> +        access_size_max = 4;
> +    }

...

> diff --git a/memory.h b/memory.h
> new file mode 100644
> index 0000000..a67ff94
> --- /dev/null
> +++ b/memory.h
> @@ -0,0 +1,201 @@
> +#ifndef MEMORY_H
> +#define MEMORY_H
> +
> +#ifndef CONFIG_USER_ONLY

What's the story with this ifdef?
There are no stubs provided ...

> +
> +#include <stdint.h>
> +#include <stdbool.h>
> +#include "qemu-common.h"
> +#include "cpu-common.h"
> +#include "targphys.h"
> +#include "qemu-queue.h"
> +
> +typedef struct MemoryRegionOps MemoryRegionOps;
> +typedef struct MemoryRegion MemoryRegion;
> +
> +/* Must match *_DIRTY_FLAGS in cpu-all.h.  To be replaced with dynamic
> + * registration.
> + */
> +#define DIRTY_MEMORY_VGA       0
> +#define DIRTY_MEMORY_CODE      1
> +#define DIRTY_MEMORY_MIGRATION 3
> +
> +/*
> + * Memory region callbacks
> + */
> +struct MemoryRegionOps {
> +    /* Read from the memory region. @addr is relative to @mr; @size is
> +     * in bytes. */
> +    uint64_t (*read)(MemoryRegion *mr,
> +                     target_phys_addr_t addr,
> +                     unsigned size);
> +    /* Write to the memory region. @addr is relative to @mr; @size is
> +     * in bytes. */
> +    void (*write)(MemoryRegion *mr,
> +                  target_phys_addr_t addr,
> +                  uint64_t data,
> +                  unsigned size);
> +
> +    enum device_endian endianness;
> +    /* Guest-visible constraints: */
> +    struct {
> +        /* If nonzero, specify bounds on access sizes beyond which a machine
> +         * check is thrown.
> +         */
> +        unsigned min_access_size;
> +        unsigned max_access_size;
> +        /* If true, unaligned accesses are supported.  Otherwise unaligned
> +         * accesses throw machine checks.
> +         */
> +         bool unaligned;
> +    } valid;
> +    /* Internal implementation constraints: */
> +    struct {
> +        /* If nonzero, specifies the minimum size implemented.  Smaller sizes
> +         * will be rounded upwards and a partial result will be returned.
> +         */
> +        unsigned min_access_size;
> +        /* If nonzero, specifies the maximum size implemented.  Larger sizes
> +         * will be done as a series of accesses with smaller sizes.
> +         */
> +        unsigned max_access_size;
> +        /* If true, unaligned accesses are supported.  Otherwise all accesses
> +         * are converted to (possibly multiple) naturally aligned accesses.
> +         */
> +         bool unaligned;
> +    } impl;
> +};
> +
> +typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> +
> +struct MemoryRegion {
> +    /* All fields are private - violators will be prosecuted */
> +    const MemoryRegionOps *ops;
> +    MemoryRegion *parent;
> +    uint64_t size;
> +    target_phys_addr_t addr;
> +    target_phys_addr_t offset;
> +    ram_addr_t ram_addr;
> +    bool has_ram_addr;
> +    MemoryRegion *alias;
> +    target_phys_addr_t alias_offset;
> +    unsigned priority;
> +    bool may_overlap;
> +    QTAILQ_HEAD(subregions, MemoryRegion) subregions;
> +    QTAILQ_ENTRY(MemoryRegion) subregions_link;
> +    QTAILQ_HEAD(coalesced_ranges, CoalescedMemoryRange) coalesced;
> +    const char *name;

I'm never completely sure whether these should be target addresses
or bus addresses or just uint64_t.
With pci on a 32 bit system you can stick a 64 bit address
in a BAR and the result will be that it is never accessed
from the CPU.
Jan Kiszka June 28, 2011, 10:28 a.m. UTC | #2
On 2011-06-28 12:03, Michael S. Tsirkin wrote:
>> +struct MemoryRegion {
>> +    /* All fields are private - violators will be prosecuted */
>> +    const MemoryRegionOps *ops;
>> +    MemoryRegion *parent;
>> +    uint64_t size;
>> +    target_phys_addr_t addr;
>> +    target_phys_addr_t offset;
>> +    ram_addr_t ram_addr;
>> +    bool has_ram_addr;
>> +    MemoryRegion *alias;
>> +    target_phys_addr_t alias_offset;
>> +    unsigned priority;
>> +    bool may_overlap;
>> +    QTAILQ_HEAD(subregions, MemoryRegion) subregions;
>> +    QTAILQ_ENTRY(MemoryRegion) subregions_link;
>> +    QTAILQ_HEAD(coalesced_ranges, CoalescedMemoryRange) coalesced;
>> +    const char *name;
> 
> I'm never completely sure whether these should be target addresses
> or bus addresses or just uint64_t.
> With pci on a 32 bit system you can stick a 64 bit address
> in a BAR and the result will be that it is never accessed
> from the CPU.
> 

Memory regions are not bound to any current or future PCI
specifications. Any fixed bit width would be wrong here, ie. size should
rather be target_phys_addr_t.

Jan
Avi Kivity June 28, 2011, 11:51 a.m. UTC | #3
On 06/28/2011 01:03 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 27, 2011 at 04:21:48PM +0300, Avi Kivity wrote:
>
> ...
>
> >  +static bool memory_region_access_valid(MemoryRegion *mr,
> >  +                                       target_phys_addr_t addr,
> >  +                                       unsigned size)
> >  +{
> >  +    if (!mr->ops->valid.unaligned && (addr & (size - 1))) {
> >  +        return false;
> >  +    }
> >  +
> >  +    /* Treat zero (unset) as "all sizes valid", for compatibility */
> >  +    if (!mr->ops->valid.max_access_size) {
> >  +        return true;
> >  +    }
> >  +
> >  +    if (size > mr->ops->valid.max_access_size
> >  +        || size < mr->ops->valid.min_access_size) {
> >  +        return false;
> >  +    }
> >  +    return true;
> >  +}
> >  +
> >  +static uint32_t memory_region_read_thunk_n(void *_mr,
> >  +                                           target_phys_addr_t addr,
> >  +                                           unsigned size)
> >  +{
> >  +    MemoryRegion *mr = _mr;
> >  +    unsigned access_size, access_size_min, access_size_max;
> >  +    uint64_t access_mask;
> >  +    uint32_t data = 0, tmp;
> >  +    unsigned i;
> >  +
> >  +    if (!memory_region_access_valid(mr, addr, size)) {
> >  +        return -1U; /* FIXME: better signalling */
> >  +    }
> >  +
> >  +    /* FIXME: support unaligned access */
> >  +
> >  +    access_size_min = mr->ops->impl.max_access_size;
>
> min = max: Intentional? Cut&paste error?

Bug; thanks.
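
(For the record, the intended assignment is presumably the min field:

     access_size_min = mr->ops->impl.min_access_size;

and the same fix applies to memory_region_write_thunk_n.)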

> >  diff --git a/memory.h b/memory.h
> >  new file mode 100644
> >  index 0000000..a67ff94
> >  --- /dev/null
> >  +++ b/memory.h
> >  @@ -0,0 +1,201 @@
> >  +#ifndef MEMORY_H
> >  +#define MEMORY_H
> >  +
> >  +#ifndef CONFIG_USER_ONLY
>
> What's the story with this ifdef?
> There are no stubs provided ...

No callers either - I build with a full configuration.  I prefer the 
#ifdef here rather than all call sites.

> >  +
> >  +struct MemoryRegion {
> >  +    /* All fields are private - violators will be prosecuted */
> >  +    const MemoryRegionOps *ops;
> >  +    MemoryRegion *parent;
> >  +    uint64_t size;
> >  +    target_phys_addr_t addr;
> >  +    target_phys_addr_t offset;
> >  +    ram_addr_t ram_addr;
> >  +    bool has_ram_addr;
> >  +    MemoryRegion *alias;
> >  +    target_phys_addr_t alias_offset;
> >  +    unsigned priority;
> >  +    bool may_overlap;
> >  +    QTAILQ_HEAD(subregions, MemoryRegion) subregions;
> >  +    QTAILQ_ENTRY(MemoryRegion) subregions_link;
> >  +    QTAILQ_HEAD(coalesced_ranges, CoalescedMemoryRange) coalesced;
> >  +    const char *name;
>
> I'm never completely sure whether these should be target addresses
> or bus addresses or just uint64_t.
> With pci on a 32 bit system you can stick a 64 bit address
> in a BAR and the result will be that it is never accessed
> from the CPU.
>

I agree.  Anyone objects to making the memory API 64-bit?

It will reduce performance slightly for 32-on-32, but these 
configurations are getting rarer, and the performance loss is quite small.

Maybe we should make t_p_a_t 64-bit unconditionally.  Note that sizes 
have to be 64-bit in any case, otherwise you can't express a 4G range 
without tricks.
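
To make the 4G point concrete (illustrative only):

     uint32_t size32 = 1ULL << 32;  /* truncates to 0: a 4G range does
                                       not fit in a 32-bit size */
     uint64_t size64 = 1ULL << 32;  /* 0x100000000, as intended */

The usual trick with a 32-bit size field is to store size - 1 instead,
at the cost of special-casing it at every use.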
Avi Kivity June 28, 2011, 11:53 a.m. UTC | #4
On 06/28/2011 01:28 PM, Jan Kiszka wrote:
> On 2011-06-28 12:03, Michael S. Tsirkin wrote:
> >>  +struct MemoryRegion {
> >>  +    /* All fields are private - violators will be prosecuted */
> >>  +    const MemoryRegionOps *ops;
> >>  +    MemoryRegion *parent;
> >>  +    uint64_t size;
> >>  +    target_phys_addr_t addr;
> >>  +    target_phys_addr_t offset;
> >>  +    ram_addr_t ram_addr;
> >>  +    bool has_ram_addr;
> >>  +    MemoryRegion *alias;
> >>  +    target_phys_addr_t alias_offset;
> >>  +    unsigned priority;
> >>  +    bool may_overlap;
> >>  +    QTAILQ_HEAD(subregions, MemoryRegion) subregions;
> >>  +    QTAILQ_ENTRY(MemoryRegion) subregions_link;
> >>  +    QTAILQ_HEAD(coalesced_ranges, CoalescedMemoryRange) coalesced;
> >>  +    const char *name;
> >
> >  I'm never completely sure whether these should be target addresses
> >  or bus addresses or just uint64_t.
> >  With pci on a 32 bit system you can stick a 64 bit address
> >  in a BAR and the result will be that it is never accessed
> >  from the CPU.
> >
>
> Memory regions are not bound to any current or future PCI
> specifications. Any fixed bit width would be wrong here, ie. size should
> rather be target_phys_addr_t.

The point is that different buses have different widths. 
target_phys_addr_t matches just one bus in the system.  It needs to be 
the maximum size of all buses present to be useful.
Jan Kiszka June 28, 2011, 12:07 p.m. UTC | #5
On 2011-06-28 13:53, Avi Kivity wrote:
> On 06/28/2011 01:28 PM, Jan Kiszka wrote:
>> On 2011-06-28 12:03, Michael S. Tsirkin wrote:
>>>>  +struct MemoryRegion {
>>>>  +    /* All fields are private - violators will be prosecuted */
>>>>  +    const MemoryRegionOps *ops;
>>>>  +    MemoryRegion *parent;
>>>>  +    uint64_t size;
>>>>  +    target_phys_addr_t addr;
>>>>  +    target_phys_addr_t offset;
>>>>  +    ram_addr_t ram_addr;
>>>>  +    bool has_ram_addr;
>>>>  +    MemoryRegion *alias;
>>>>  +    target_phys_addr_t alias_offset;
>>>>  +    unsigned priority;
>>>>  +    bool may_overlap;
>>>>  +    QTAILQ_HEAD(subregions, MemoryRegion) subregions;
>>>>  +    QTAILQ_ENTRY(MemoryRegion) subregions_link;
>>>>  +    QTAILQ_HEAD(coalesced_ranges, CoalescedMemoryRange) coalesced;
>>>>  +    const char *name;
>>>
>>>  I'm never completely sure whether these should be target addresses
>>>  or bus addresses or just uint64_t.
>>>  With pci on a 32 bit system you can stick a 64 bit address
>>>  in a BAR and the result will be that it is never accessed
>>>  from the CPU.
>>>
>>
>> Memory regions are not bound to any current or future PCI
>> specifications. Any fixed bit width would be wrong here, ie. size should
>> rather be target_phys_addr_t.
> 
> The point is that different buses have different widths. 
> target_phys_addr_t matches just one bus in the system.  It needs to be 
> the maximum size of all buses present to be useful.

Then we need a type for that. Or we need to demand that
target_phys_addr_t is defined large enough to support all buses that the
particular arch wants to address. Hardcoding 64 bit or anything is not
appropriate for a generic subsystem.

Jan
Avi Kivity June 28, 2011, 12:09 p.m. UTC | #6
On 06/28/2011 03:07 PM, Jan Kiszka wrote:
> >
> >  The point is that different buses have different widths.
> >  target_phys_addr_t matches just one bus in the system.  It needs to be
> >  the maximum size of all buses present to be useful.
>
> Then we need a type for that. Or we need to demand that
> target_phys_addr_t is defined large enough to support all buses that the
> particular arch wants to address. Hardcoding 64 bit or anything is not
> appropriate for a generic subsystem.

Okay, let's make t_p_a_t max(bus size in system).  Do we have 32-bit 
targets that don't support pci (I guess, pc-isa with cpu < ppro?).  Do 
we want to support a 32-bit variant of pci?  It certainly existed at 
some point.
Jan Kiszka June 28, 2011, 12:46 p.m. UTC | #7
On 2011-06-28 14:09, Avi Kivity wrote:
> On 06/28/2011 03:07 PM, Jan Kiszka wrote:
>>>
>>>  The point is that different buses have different widths.
>>>  target_phys_addr_t matches just one bus in the system.  It needs to be
>>>  the maximum size of all buses present to be useful.
>>
>> Then we need a type for that. Or we need to demand that
>> target_phys_addr_t is defined large enough to support all buses that the
>> particular arch wants to address. Hardcoding 64 bit or anything is not
>> appropriate for a generic subsystem.
> 
> Okay, let's make t_p_a_t max(bus size in system).  Do we have 32-bit 
> targets that don't support pci (I guess, pc-isa with cpu < ppro?).

At least lm32 and microblaze appear to fall into that category.

> Do we want to support a 32-bit variant of pci?  It certainly existed at 
> some point.

As long as making everything 64 bit in the implementation of the device
models is not guest visible, I don't think that should be a problem.

Jan
Avi Kivity June 28, 2011, 12:53 p.m. UTC | #8
On 06/28/2011 03:46 PM, Jan Kiszka wrote:
>
> >  Do we want to support a 32-bit variant of pci?  It certainly existed at
> >  some point.
>
> As long as making everything 64 bit in the implementation of the device
> models is not guest visible, I don't think that should be a problem.
>

How would it become guest visible?

AFAICT the only implication is a very minor slowdown in the cases where 
it is not actually required.
Peter Maydell June 28, 2011, 1:25 p.m. UTC | #9
On 28 June 2011 13:09, Avi Kivity <avi@redhat.com> wrote:
> Okay, let's make t_p_a_t max(bus size in system).

If you want a type for that, can't you give it a sensible (ie
different) name? target_phys_addr_t is pretty clearly "the type
of a physical address for this target" and having it actually
be something else is just going to be confusing.

> Do we have 32-bit targets
> that don't support pci (I guess, pc-isa with cpu < ppro?).  Do we want to
> support a 32-bit variant of pci?  It certainly existed at some point.

As a thought experiment, you could take an existing 32 bit
target and define a new board model that happens to have eg a
new pci controller on it. It doesn't seem right that that
should cause the system's idea of this type width to change,
it's just a new device model and board. So if you have this
type I think it ought to be max(bus size of widest bus qemu
supports).

-- PMM
Avi Kivity June 28, 2011, 1:36 p.m. UTC | #10
On 06/28/2011 04:25 PM, Peter Maydell wrote:
> On 28 June 2011 13:09, Avi Kivity <avi@redhat.com> wrote:
> >  Okay, let's make t_p_a_t max(bus size in system).
>
> If you want a type for that, can't you give it a sensible (ie
> different) name? target_phys_addr_t is pretty clearly "the type
> of a physical address for this target" and having it actually
> be something else is just going to be confusing.

"a physical address" is ambiguous.  There are many physical addresses 
flowing around.  Certainly it's most natural to think about the 
processor's physical address bus, but that's not always useful.

Since all *devices* use target_phys_addr_t, I think we should just adopt 
that to avoid major and pointless churn.

> >  Do we have 32-bit targets
> >  that don't support pci (I guess, pc-isa with cpu < ppro?).  Do we want to
> >  support a 32-bit variant of pci?  It certainly existed at some point.
>
> As a thought experiment, you could take an existing 32 bit
> target and define a new board model that happens to have eg a
> new pci controller on it. It doesn't seem right that that
> should cause the system's idea of this type width to change,
> it's just a new device model and board. So if you have this
> type I think it ought to be max(bus size of widest bus qemu
> supports).

That indicates

   typedef uint64_t target_phys_addr_t;
Olivier Galibert June 28, 2011, 4:27 p.m. UTC | #11
On Tue, Jun 28, 2011 at 03:09:38PM +0300, Avi Kivity wrote:
> On 06/28/2011 03:07 PM, Jan Kiszka wrote:
> > >
> > >  The point is that different buses have different widths.
> > >  target_phys_addr_t matches just one bus in the system.  It needs to be
> > >  the maximum size of all buses present to be useful.
> >
> > Then we need a type for that. Or we need to demand that
> > target_phys_addr_t is defined large enough to support all buses that the
> > particular arch wants to address. Hardcoding 64 bit or anything is not
> > appropriate for a generic subsystem.
> 
> Okay, let's make t_p_a_t max(bus size in system).  Do we have 32-bit 
> targets that don't support pci (I guess, pc-isa with cpu < ppro?).  Do 
> we want to support a 32-bit variant of pci?  It certainly existed at 
> some point.

PCI always had a mechanism for 64-bits addresses even on 32-bits wide
bus, called Dual Address Cycle.  I'm not sure which was rarer: devices
which could handle it, or north bridges which could use it.  Probably
a tie.

But in theory, it was there.

  OG.

Patch

diff --git a/Makefile.target b/Makefile.target
index 03d3646..5b24bd7 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -194,6 +194,7 @@  obj-$(CONFIG_REALLY_VIRTFS) += 9pfs/virtio-9p-device.o
 obj-y += rwhandler.o
 obj-$(CONFIG_KVM) += kvm.o kvm-all.o
 obj-$(CONFIG_NO_KVM) += kvm-stub.o
+obj-y += memory.o
 LIBS+=-lz
 
 QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
diff --git a/memory.c b/memory.c
new file mode 100644
index 0000000..43499c3
--- /dev/null
+++ b/memory.c
@@ -0,0 +1,659 @@ 
+/*
+ * Physical memory management
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Avi Kivity <avi@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "memory.h"
+#include <assert.h>
+
+typedef struct AddrRange AddrRange;
+
+struct AddrRange {
+    uint64_t start;
+    uint64_t size;
+};
+
+static AddrRange addrrange_make(uint64_t start, uint64_t size)
+{
+    return (AddrRange) { start, size };
+}
+
+static bool addrrange_equal(AddrRange r1, AddrRange r2)
+{
+    return r1.start == r2.start && r1.size == r2.size;
+}
+
+static uint64_t addrrange_end(AddrRange r)
+{
+    return r.start + r.size;
+}
+
+static AddrRange addrrange_shift(AddrRange range, int64_t delta)
+{
+    range.start += delta;
+    return range;
+}
+
+static bool addrrange_intersects(AddrRange r1, AddrRange r2)
+{
+    return (r1.start >= r2.start && r1.start < r2.start + r2.size)
+        || (r2.start >= r1.start && r2.start < r1.start + r1.size);
+}
+
+static AddrRange addrrange_intersection(AddrRange r1, AddrRange r2)
+{
+    uint64_t start = MAX(r1.start, r2.start);
+    /* off-by-one arithmetic to prevent overflow */
+    uint64_t end = MIN(addrrange_end(r1) - 1, addrrange_end(r2) - 1);
+    return addrrange_make(start, end - start + 1);
+}
+
+struct CoalescedMemoryRange {
+    AddrRange addr;
+    QTAILQ_ENTRY(CoalescedMemoryRange) link;
+};
+
+typedef struct FlatRange FlatRange;
+typedef struct FlatView FlatView;
+
+/* Range of memory in the global map.  Addresses are absolute. */
+struct FlatRange {
+    MemoryRegion *mr;
+    target_phys_addr_t offset_in_region;
+    AddrRange addr;
+};
+
+/* Flattened global view of current active memory hierarchy.  Kept in sorted
+ * order.
+ */
+struct FlatView {
+    FlatRange *ranges;
+    unsigned nr;
+    unsigned nr_allocated;
+};
+
+#define FOR_EACH_FLAT_RANGE(var, view)          \
+    for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
+
+static FlatView current_memory_map;
+static MemoryRegion *root_memory_region;
+
+static bool flatrange_equal(FlatRange *a, FlatRange *b)
+{
+    return a->mr == b->mr
+        && addrrange_equal(a->addr, b->addr)
+        && a->offset_in_region == b->offset_in_region;
+}
+
+static void flatview_init(FlatView *view)
+{
+    view->ranges = NULL;
+    view->nr = 0;
+    view->nr_allocated = 0;
+}
+
+/* Insert a range into a given position.  Caller is responsible for maintaining
+ * sorting order.
+ */
+static void flatview_insert(FlatView *view, unsigned pos, FlatRange *range)
+{
+    if (view->nr == view->nr_allocated) {
+        view->nr_allocated = MAX(2 * view->nr, 10);
+        view->ranges = qemu_realloc(view->ranges,
+                                    view->nr_allocated * sizeof(*view->ranges));
+    }
+    memmove(view->ranges + pos + 1, view->ranges + pos,
+            (view->nr - pos) * sizeof(FlatRange));
+    view->ranges[pos] = *range;
+    ++view->nr;
+}
+
+static void flatview_destroy(FlatView *view)
+{
+    qemu_free(view->ranges);
+}
+
+/* Render a memory region into the global view.  Ranges in @view obscure
+ * ranges in @mr.
+ */
+static void render_memory_region(FlatView *view,
+                                 MemoryRegion *mr,
+                                 target_phys_addr_t base,
+                                 AddrRange clip)
+{
+    MemoryRegion *subregion;
+    unsigned i;
+    target_phys_addr_t offset_in_region;
+    uint64_t remain;
+    uint64_t now;
+    FlatRange fr;
+    AddrRange tmp;
+
+    base += mr->addr;
+
+    tmp = addrrange_make(base, mr->size);
+
+    if (!addrrange_intersects(tmp, clip)) {
+        return;
+    }
+
+    clip = addrrange_intersection(tmp, clip);
+
+    if (mr->alias) {
+        base -= mr->alias->addr;
+        base -= mr->alias_offset;
+        render_memory_region(view, mr->alias, base, clip);
+        return;
+    }
+
+    /* Render subregions in priority order. */
+    QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
+        render_memory_region(view, subregion, base, clip);
+    }
+
+    if (!mr->has_ram_addr) {
+        return;
+    }
+
+    offset_in_region = clip.start - base;
+    base = clip.start;
+    remain = clip.size;
+
+    /* Render the region itself into any gaps left by the current view. */
+    for (i = 0; i < view->nr && remain; ++i) {
+        if (base >= addrrange_end(view->ranges[i].addr)) {
+            continue;
+        }
+        if (base < view->ranges[i].addr.start) {
+            now = MIN(remain, view->ranges[i].addr.start - base);
+            fr.mr = mr;
+            fr.offset_in_region = offset_in_region;
+            fr.addr = addrrange_make(base, now);
+            flatview_insert(view, i, &fr);
+            ++i;
+            base += now;
+            offset_in_region += now;
+            remain -= now;
+        }
+        if (base == view->ranges[i].addr.start) {
+            now = MIN(remain, view->ranges[i].addr.size);
+            base += now;
+            offset_in_region += now;
+            remain -= now;
+        }
+    }
+    if (remain) {
+        fr.mr = mr;
+        fr.offset_in_region = offset_in_region;
+        fr.addr = addrrange_make(base, remain);
+        flatview_insert(view, i, &fr);
+    }
+}
+
+/* Render a memory topology into a list of disjoint absolute ranges. */
+static FlatView generate_memory_topology(MemoryRegion *mr)
+{
+    FlatView view;
+
+    flatview_init(&view);
+
+    render_memory_region(&view, mr, 0, addrrange_make(0, UINT64_MAX));
+
+    return view;
+}
+
+static void memory_region_update_topology(void)
+{
+    FlatView old_view = current_memory_map;
+    FlatView new_view = generate_memory_topology(root_memory_region);
+    unsigned iold, inew;
+    FlatRange *frold, *frnew;
+    ram_addr_t phys_offset, region_offset;
+
+    /* Generate a symmetric difference of the old and new memory maps.
+     * Kill ranges in the old map, and instantiate ranges in the new map.
+     */
+    iold = inew = 0;
+    while (iold < old_view.nr || inew < new_view.nr) {
+        if (iold < old_view.nr) {
+            frold = &old_view.ranges[iold];
+        } else {
+            frold = NULL;
+        }
+        if (inew < new_view.nr) {
+            frnew = &new_view.ranges[inew];
+        } else {
+            frnew = NULL;
+        }
+
+        if (frold
+            && (!frnew
+                || frold->addr.start < frnew->addr.start
+                || (frold->addr.start == frnew->addr.start
+                    && !flatrange_equal(frold, frnew)))) {
+            /* In old, but (not in new, or in new but attributes changed). */
+
+            cpu_register_physical_memory(frold->addr.start, frold->addr.size,
+                                         IO_MEM_UNASSIGNED);
+            ++iold;
+        } else if (frold && frnew && flatrange_equal(frold, frnew)) {
+            /* In both (logging may have changed) */
+
+            ++iold;
+            ++inew;
+            /* FIXME: dirty logging */
+        } else {
+            /* In new */
+
+            phys_offset = frnew->mr->ram_addr;
+            region_offset = frnew->offset_in_region;
+            /* cpu_register_physical_memory_log() wants region_offset for
+             * mmio, but prefers offsetting phys_offset for RAM.  Humour it.
+             */
+            if ((phys_offset & ~TARGET_PAGE_MASK) <= IO_MEM_ROM) {
+                phys_offset += region_offset;
+                region_offset = 0;
+            }
+
+            cpu_register_physical_memory_log(frnew->addr.start,
+                                             frnew->addr.size,
+                                             phys_offset,
+                                             region_offset,
+                                             0);
+            ++inew;
+        }
+    }
+    current_memory_map = new_view;
+    flatview_destroy(&old_view);
+}
+
+void memory_region_init(MemoryRegion *mr,
+                        const char *name,
+                        uint64_t size)
+{
+    mr->ops = NULL;
+    mr->parent = NULL;
+    mr->size = size;
+    mr->addr = 0;
+    mr->offset = 0;
+    mr->has_ram_addr = false;
+    mr->priority = 0;
+    mr->may_overlap = false;
+    mr->alias = NULL;
+    QTAILQ_INIT(&mr->subregions);
+    memset(&mr->subregions_link, 0, sizeof mr->subregions_link);
+    QTAILQ_INIT(&mr->coalesced);
+    mr->name = qemu_strdup(name);
+}
+
+static bool memory_region_access_valid(MemoryRegion *mr,
+                                       target_phys_addr_t addr,
+                                       unsigned size)
+{
+    if (!mr->ops->valid.unaligned && (addr & (size - 1))) {
+        return false;
+    }
+
+    /* Treat zero (unset) as "all sizes valid", for compatibility */
+    if (!mr->ops->valid.max_access_size) {
+        return true;
+    }
+
+    if (size > mr->ops->valid.max_access_size
+        || size < mr->ops->valid.min_access_size) {
+        return false;
+    }
+    return true;
+}
+
+static uint32_t memory_region_read_thunk_n(void *_mr,
+                                           target_phys_addr_t addr,
+                                           unsigned size)
+{
+    MemoryRegion *mr = _mr;
+    unsigned access_size, access_size_min, access_size_max;
+    uint64_t access_mask;
+    uint32_t data = 0, tmp;
+    unsigned i;
+
+    if (!memory_region_access_valid(mr, addr, size)) {
+        return -1U; /* FIXME: better signalling */
+    }
+
+    /* FIXME: support unaligned access */
+
+    access_size_min = mr->ops->impl.max_access_size;
+    if (!access_size_min) {
+        access_size_min = 1;
+    }
+    access_size_max = mr->ops->impl.max_access_size;
+    if (!access_size_max) {
+        access_size_max = 4;
+    }
+    access_size = MAX(MIN(size, access_size_max), access_size_min);
+    access_mask = -1ULL >> (64 - access_size * 8);
+    addr += mr->offset;
+    for (i = 0; i < size; i += access_size) {
+        /* FIXME: big-endian support */
+        tmp = mr->ops->read(mr, addr + i, access_size);
+        data |= (tmp & access_mask) << (i * 8);
+    }
+
+    return data;
+}
+
+static void memory_region_write_thunk_n(void *_mr,
+                                        target_phys_addr_t addr,
+                                        unsigned size,
+                                        uint64_t data)
+{
+    MemoryRegion *mr = _mr;
+    unsigned access_size, access_size_min, access_size_max;
+    uint64_t access_mask;
+    unsigned i;
+
+    if (!memory_region_access_valid(mr, addr, size)) {
+        return; /* FIXME: better signalling */
+    }
+
+    /* FIXME: support unaligned access */
+
+    access_size_min = mr->ops->impl.max_access_size;
+    if (!access_size_min) {
+        access_size_min = 1;
+    }
+    access_size_max = mr->ops->impl.max_access_size;
+    if (!access_size_max) {
+        access_size_max = 4;
+    }
+    access_size = MAX(MIN(size, access_size_max), access_size_min);
+    access_mask = -1ULL >> (64 - access_size * 8);
+    addr += mr->offset;
+    for (i = 0; i < size; i += access_size) {
+        /* FIXME: big-endian support */
+        mr->ops->write(mr, addr + i, (data >> (i * 8)) & access_mask,
+                       access_size);
+    }
+}
+
+static uint32_t memory_region_read_thunk_b(void *mr, target_phys_addr_t addr)
+{
+    return memory_region_read_thunk_n(mr, addr, 1);
+}
+
+static uint32_t memory_region_read_thunk_w(void *mr, target_phys_addr_t addr)
+{
+    return memory_region_read_thunk_n(mr, addr, 2);
+}
+
+static uint32_t memory_region_read_thunk_l(void *mr, target_phys_addr_t addr)
+{
+    return memory_region_read_thunk_n(mr, addr, 4);
+}
+
+static void memory_region_write_thunk_b(void *mr, target_phys_addr_t addr,
+                                        uint32_t data)
+{
+    memory_region_write_thunk_n(mr, addr, 1, data);
+}
+
+static void memory_region_write_thunk_w(void *mr, target_phys_addr_t addr,
+                                        uint32_t data)
+{
+    memory_region_write_thunk_n(mr, addr, 2, data);
+}
+
+static void memory_region_write_thunk_l(void *mr, target_phys_addr_t addr,
+                                        uint32_t data)
+{
+    memory_region_write_thunk_n(mr, addr, 4, data);
+}
+
+static CPUReadMemoryFunc * const memory_region_read_thunk[] = {
+    memory_region_read_thunk_b,
+    memory_region_read_thunk_w,
+    memory_region_read_thunk_l,
+};
+
+static CPUWriteMemoryFunc * const memory_region_write_thunk[] = {
+    memory_region_write_thunk_b,
+    memory_region_write_thunk_w,
+    memory_region_write_thunk_l,
+};
+
+void memory_region_init_io(MemoryRegion *mr,
+                           const MemoryRegionOps *ops,
+                           const char *name,
+                           uint64_t size)
+{
+    memory_region_init(mr, name, size);
+    mr->ops = ops;
+    mr->has_ram_addr = true;
+    mr->ram_addr = cpu_register_io_memory(memory_region_read_thunk,
+                                          memory_region_write_thunk,
+                                          mr,
+                                          mr->ops->endianness);
+}
+
+void memory_region_init_ram(MemoryRegion *mr,
+                            DeviceState *dev,
+                            const char *name,
+                            uint64_t size)
+{
+    memory_region_init(mr, name, size);
+    mr->has_ram_addr = true;
+    mr->ram_addr = qemu_ram_alloc(dev, name, size);
+}
+
+void memory_region_init_ram_ptr(MemoryRegion *mr,
+                                DeviceState *dev,
+                                const char *name,
+                                uint64_t size,
+                                void *ptr)
+{
+    memory_region_init(mr, name, size);
+    mr->has_ram_addr = true;
+    mr->ram_addr = qemu_ram_alloc_from_ptr(dev, name, size, ptr);
+}
+
+void memory_region_init_alias(MemoryRegion *mr,
+                              const char *name,
+                              MemoryRegion *orig,
+                              target_phys_addr_t offset,
+                              uint64_t size)
+{
+    memory_region_init(mr, name, size);
+    mr->alias = orig;
+    mr->alias_offset = offset;
+}
+
+void memory_region_destroy(MemoryRegion *mr)
+{
+    assert(QTAILQ_EMPTY(&mr->subregions));
+    memory_region_clear_coalescing(mr);
+    qemu_free((char *)mr->name);
+}
+
+target_phys_addr_t memory_region_size(MemoryRegion *mr)
+{
+    return mr->size;
+}
+
+void memory_region_set_offset(MemoryRegion *mr, target_phys_addr_t offset)
+{
+    mr->offset = offset;
+}
+
+void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
+{
+    /* FIXME */
+}
+
+bool memory_region_get_dirty(MemoryRegion *mr, target_phys_addr_t addr,
+                             unsigned client)
+{
+    /* FIXME */
+    return true;
+}
+
+void memory_region_set_dirty(MemoryRegion *mr, target_phys_addr_t addr)
+{
+    /* FIXME */
+}
+
+void memory_region_sync_dirty_bitmap(MemoryRegion *mr)
+{
+    /* FIXME */
+}
+
+void memory_region_set_readonly(MemoryRegion *mr, bool readonly)
+{
+    /* FIXME */
+}
+
+void memory_region_reset_dirty(MemoryRegion *mr, target_phys_addr_t addr,
+                               target_phys_addr_t size, unsigned client)
+{
+    /* FIXME */
+}
+
+void *memory_region_get_ram_ptr(MemoryRegion *mr)
+{
+    if (mr->alias) {
+        return memory_region_get_ram_ptr(mr->alias) + mr->alias_offset;
+    }
+
+    if (!mr->has_ram_addr) {
+        abort();
+    }
+
+    return qemu_get_ram_ptr(mr->ram_addr);
+}
+
+static void memory_region_update_coalesced_range(MemoryRegion *mr)
+{
+    FlatRange *fr;
+    CoalescedMemoryRange *cmr;
+    AddrRange tmp;
+
+    FOR_EACH_FLAT_RANGE(fr, &current_memory_map) {
+        if (fr->mr == mr) {
+            qemu_unregister_coalesced_mmio(fr->addr.start, fr->addr.size);
+            QTAILQ_FOREACH(cmr, &mr->coalesced, link) {
+                tmp = addrrange_shift(cmr->addr,
+                                      fr->addr.start - fr->offset_in_region);
+                if (!addrrange_intersects(tmp, fr->addr)) {
+                    continue;
+                }
+                tmp = addrrange_intersection(tmp, fr->addr);
+                qemu_register_coalesced_mmio(tmp.start, tmp.size);
+            }
+        }
+    }
+}
+
+void memory_region_set_coalescing(MemoryRegion *mr)
+{
+    memory_region_clear_coalescing(mr);
+    memory_region_add_coalescing(mr, 0, mr->size);
+}
+
+void memory_region_add_coalescing(MemoryRegion *mr,
+                                  target_phys_addr_t offset,
+                                  uint64_t size)
+{
+    CoalescedMemoryRange *cmr = qemu_malloc(sizeof(*cmr));
+
+    cmr->addr = addrrange_make(offset, size);
+    QTAILQ_INSERT_TAIL(&mr->coalesced, cmr, link);
+    memory_region_update_coalesced_range(mr);
+}
+
+void memory_region_clear_coalescing(MemoryRegion *mr)
+{
+    CoalescedMemoryRange *cmr;
+
+    while (!QTAILQ_EMPTY(&mr->coalesced)) {
+        cmr = QTAILQ_FIRST(&mr->coalesced);
+        QTAILQ_REMOVE(&mr->coalesced, cmr, link);
+        qemu_free(cmr);
+    }
+    memory_region_update_coalesced_range(mr);
+}
+
+static void memory_region_add_subregion_common(MemoryRegion *mr,
+                                               target_phys_addr_t offset,
+                                               MemoryRegion *subregion)
+{
+    MemoryRegion *other;
+
+    assert(!subregion->parent);
+    subregion->parent = mr;
+    subregion->addr = offset;
+    QTAILQ_FOREACH(other, &mr->subregions, subregions_link) {
+        if (subregion->may_overlap || other->may_overlap) {
+            continue;
+        }
+        if (offset >= other->addr + other->size
+            || offset + subregion->size <= other->addr) {
+            continue;
+        }
+        printf("warning: subregion collision %llx/%llx vs %llx/%llx\n",
+               (unsigned long long)offset,
+               (unsigned long long)subregion->size,
+               (unsigned long long)other->addr,
+               (unsigned long long)other->size);
+    }
+    QTAILQ_FOREACH(other, &mr->subregions, subregions_link) {
+        if (subregion->priority >= other->priority) {
+            QTAILQ_INSERT_BEFORE(other, subregion, subregions_link);
+            goto done;
+        }
+    }
+    QTAILQ_INSERT_TAIL(&mr->subregions, subregion, subregions_link);
+done:
+    memory_region_update_topology();
+}
+
+
+void memory_region_add_subregion(MemoryRegion *mr,
+                                 target_phys_addr_t offset,
+                                 MemoryRegion *subregion)
+{
+    subregion->may_overlap = false;
+    subregion->priority = 0;
+    memory_region_add_subregion_common(mr, offset, subregion);
+}
+
+void memory_region_add_subregion_overlap(MemoryRegion *mr,
+                                         target_phys_addr_t offset,
+                                         MemoryRegion *subregion,
+                                         unsigned priority)
+{
+    subregion->may_overlap = true;
+    subregion->priority = priority;
+    memory_region_add_subregion_common(mr, offset, subregion);
+}
+
+void memory_region_del_subregion(MemoryRegion *mr,
+                                 MemoryRegion *subregion)
+{
+    assert(subregion->parent == mr);
+    subregion->parent = NULL;
+    QTAILQ_REMOVE(&mr->subregions, subregion, subregions_link);
+    memory_region_update_topology();
+}
+
+void set_system_memory_map(MemoryRegion *mr)
+{
+    root_memory_region = mr;
+    memory_region_update_topology();
+}
diff --git a/memory.h b/memory.h
new file mode 100644
index 0000000..a67ff94
--- /dev/null
+++ b/memory.h
@@ -0,0 +1,201 @@ 
+#ifndef MEMORY_H
+#define MEMORY_H
+
+#ifndef CONFIG_USER_ONLY
+
+#include <stdint.h>
+#include <stdbool.h>
+#include "qemu-common.h"
+#include "cpu-common.h"
+#include "targphys.h"
+#include "qemu-queue.h"
+
+typedef struct MemoryRegionOps MemoryRegionOps;
+typedef struct MemoryRegion MemoryRegion;
+
+/* Must match *_DIRTY_FLAGS in cpu-all.h.  To be replaced with dynamic
+ * registration.
+ */
+#define DIRTY_MEMORY_VGA       0
+#define DIRTY_MEMORY_CODE      1
+#define DIRTY_MEMORY_MIGRATION 3
+
+/*
+ * Memory region callbacks
+ */
+struct MemoryRegionOps {
+    /* Read from the memory region. @addr is relative to @mr; @size is
+     * in bytes. */
+    uint64_t (*read)(MemoryRegion *mr,
+                     target_phys_addr_t addr,
+                     unsigned size);
+    /* Write to the memory region. @addr is relative to @mr; @size is
+     * in bytes. */
+    void (*write)(MemoryRegion *mr,
+                  target_phys_addr_t addr,
+                  uint64_t data,
+                  unsigned size);
+
+    enum device_endian endianness;
+    /* Guest-visible constraints: */
+    struct {
+        /* If nonzero, specify bounds on access sizes beyond which a machine
+         * check is thrown.
+         */
+        unsigned min_access_size;
+        unsigned max_access_size;
+        /* If true, unaligned accesses are supported.  Otherwise unaligned
+         * accesses throw machine checks.
+         */
+         bool unaligned;
+    } valid;
+    /* Internal implementation constraints: */
+    struct {
+        /* If nonzero, specifies the minimum size implemented.  Smaller sizes
+         * will be rounded upwards and a partial result will be returned.
+         */
+        unsigned min_access_size;
+        /* If nonzero, specifies the maximum size implemented.  Larger sizes
+         * will be done as a series of accesses with smaller sizes.
+         */
+        unsigned max_access_size;
+        /* If true, unaligned accesses are supported.  Otherwise all accesses
+         * are converted to (possibly multiple) naturally aligned accesses.
+         */
+         bool unaligned;
+    } impl;
+};
+
+typedef struct CoalescedMemoryRange CoalescedMemoryRange;
+
+struct MemoryRegion {
+    /* All fields are private - violators will be prosecuted */
+    const MemoryRegionOps *ops;
+    MemoryRegion *parent;
+    uint64_t size;
+    target_phys_addr_t addr;
+    target_phys_addr_t offset;
+    ram_addr_t ram_addr;
+    bool has_ram_addr;
+    MemoryRegion *alias;
+    target_phys_addr_t alias_offset;
+    unsigned priority;
+    bool may_overlap;
+    QTAILQ_HEAD(subregions, MemoryRegion) subregions;
+    QTAILQ_ENTRY(MemoryRegion) subregions_link;
+    QTAILQ_HEAD(coalesced_ranges, CoalescedMemoryRange) coalesced;
+    const char *name;
+};
+
+/* Initialize a memory region
+ *
+ * The region typically acts as a container for other memory regions.
+ */
+void memory_region_init(MemoryRegion *mr,
+                        const char *name,
+                        uint64_t size);
+/* Initialize an I/O memory region.  Accesses into the region will
+ * cause the callbacks in @ops to be called.
+ *
+ * If @size is nonzero, subregions will be clipped to @size.
+ */
+void memory_region_init_io(MemoryRegion *mr,
+                           const MemoryRegionOps *ops,
+                           const char *name,
+                           uint64_t size);
+/* Initialize a RAM memory region.  Accesses into the region will
+ * modify memory directly.
+ */
+void memory_region_init_ram(MemoryRegion *mr,
+                            DeviceState *dev, /* FIXME: layering violation */
+                            const char *name,
+                            uint64_t size);
+/* Initialize a RAM memory region backed by caller-provided memory.
+ * Accesses into the region will modify memory in @ptr directly.
+ */
+void memory_region_init_ram_ptr(MemoryRegion *mr,
+                                DeviceState *dev, /* FIXME: layering violation */
+                                const char *name,
+                                uint64_t size,
+                                void *ptr);
+/* Initialize a memory region which aliases a section of another memory
+ * region.
+ */
+void memory_region_init_alias(MemoryRegion *mr,
+                              const char *name,
+                              MemoryRegion *orig,
+                              target_phys_addr_t offset,
+                              uint64_t size);
+
+/* Destroy a memory region.  The memory becomes inaccessible. */
+void memory_region_destroy(MemoryRegion *mr);
+
+target_phys_addr_t memory_region_size(MemoryRegion *mr);
+
+/* Get a pointer into a RAM memory region; use with care */
+void *memory_region_get_ram_ptr(MemoryRegion *mr);
+
+/* Set an offset to be added to MemoryRegionOps callbacks.  This function
+ * is deprecated and should not be used in new code. */
+void memory_region_set_offset(MemoryRegion *mr, target_phys_addr_t offset);
+
+/* Turn logging on or off for specified client (display, migration) */
+void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client);
+
+/* Check whether a page is dirty for a specified client. */
+bool memory_region_get_dirty(MemoryRegion *mr, target_phys_addr_t addr,
+                             unsigned client);
+
+/* Mark a page as dirty in a memory region, after it has been dirtied outside
+ * guest code
+ */
+void memory_region_set_dirty(MemoryRegion *mr, target_phys_addr_t addr);
+
+/* Synchronize a region's dirty bitmap with any external TLBs (e.g. kvm) */
+void memory_region_sync_dirty_bitmap(MemoryRegion *mr);
+
+/* Mark a range of pages as not dirty, for a specified client. */
+void memory_region_reset_dirty(MemoryRegion *mr, target_phys_addr_t addr,
+                               target_phys_addr_t size, unsigned client);
+
+/* Turn a memory region read-only (or read-write) */
+void memory_region_set_readonly(MemoryRegion *mr, bool readonly);
+
+/* Enable memory coalescing for the region.  MMIO ->write callbacks may be
+ * delayed until a non-coalesced MMIO is issued.
+ */
+void memory_region_set_coalescing(MemoryRegion *mr);
+
+/* Enable memory coalescing for a sub-range of the region.  MMIO ->write
+ * callbacks may be delayed until a non-coalesced MMIO is issued.
+ */
+void memory_region_add_coalescing(MemoryRegion *mr,
+                                  target_phys_addr_t offset,
+                                  uint64_t size);
+/* Disable MMIO coalescing for the region. */
+void memory_region_clear_coalescing(MemoryRegion *mr);
+
+/* Add a sub-region at @offset.  The sub-region may not overlap with other
+ * subregions (except for those explicitly marked as overlapping)
+ */
+void memory_region_add_subregion(MemoryRegion *mr,
+                                 target_phys_addr_t offset,
+                                 MemoryRegion *subregion);
+/* Add a sub-region at @offset.  The sub-region may overlap other subregions;
+ * conflicts are resolved by having a higher @priority hide a lower @priority.
+ * Subregions without priority are taken as @priority 0.
+ */
+void memory_region_add_subregion_overlap(MemoryRegion *mr,
+                                         target_phys_addr_t offset,
+                                         MemoryRegion *subregion,
+                                         unsigned priority);
+/* Remove a subregion. */
+void memory_region_del_subregion(MemoryRegion *mr,
+                                 MemoryRegion *subregion);
+
+/* Set the root memory region.  This region is the system memory map. */
+void set_system_memory_map(MemoryRegion *mr);
+
+#endif
+
+#endif