Patchwork [RFC,10/15] memory: Rework sub-page handling

login
register
mail settings
Submitter Paolo Bonzini
Date May 7, 2013, 12:35 p.m.
Message ID <5188F4FD.4000506@redhat.com>
Download mbox | patch
Permalink /patch/242194/
State New
Headers show

Comments

Paolo Bonzini - May 7, 2013, 12:35 p.m.
Il 06/05/2013 22:46, Peter Maydell ha scritto:
> On 6 May 2013 15:26, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> Simplify the sub-page handling by implementing it directly in the
>> dispatcher instead of using a redirection memory region. We extend the
>> phys_sections entries to optionally hold a pointer to the sub-section
>> table that used to reside in the subpage_t structure. IOW, we add one
>> optional dispatch level below the existing radix tree.
>>
>> address_space_lookup_region is extended to take this additional level
>> into account. This direct dispatching to that target memory region will
>> also be helpful when we want to add per-region locking control.
> 
> This patch seems to break vexpress-a9. Test case if you want it:
> http://staging.people.linaro.org/~peter.maydell/vexpress-3.8.tar.gz
> (125MB) Edit the 'runme' script to fix up the paths to kernel/initrd/dtb
> and then run it; before this patch it boots, afterwards it doesn't
> even manage to start the kernel.
> 
> My guess is you've broken subregion-sized mmio regions somehow
> (and/or regions which are larger than a page in size but start
> or finish at a non-page-aligned address), and probably in particular
> the arm_gic regions that a9mpcore maps...

I think we just found out what all the subpage stuff is for. :)

When I added address_space_translate(), the trickiest conversion to the
new API was in tlb_set_page.  Hence I added a "you never know"-style assert:

    assert(size >= TARGET_PAGE_SIZE);
    if (size != TARGET_PAGE_SIZE) {
        tlb_add_large_page(env, vaddr, size);
     }
-    section = phys_page_find(address_space_memory.dispatch,
-			     paddr >> TARGET_PAGE_BITS);
+    sz = size;
+    section = address_space_translate(&address_space_memory, paddr,
+				      &xlat, &sz, false);
+    assert(sz >= TARGET_PAGE_SIZE);


Now, remember that address_space_translate clamps sz on output to the
size of the section.  And here's what happens:

(gdb) p sz
$4 = 256
(gdb) p *(section->mr)
$5 = {ops = 0x7ffff82d33c0, iommu_ops = 0x0, opaque = 0x7ffff91e6b50,
parent = 0x7ffff9069400, owner = 0x0, size = {lo = 256,
    hi = 0}, addr = 0, destructor = 0x7ffff7e824d0
<memory_region_destructor_none>, ram_addr = 18446744073709551615,
terminates =
    true, romd_mode = true, ram = false, readonly = false, enabled =
true, rom_device = false, warning_printed = false,
  flush_coalesced_mmio = false, alias = 0x0, alias_offset = 0, priority
= 0, may_overlap = false, subregions = {tqh_first = 0x0,
    tqh_last = 0x7ffff91e8ee8}, subregions_link = {tqe_next = 0x0,
tqe_prev = 0x7ffff91e64f8}, coalesced = {tqh_first = 0x0,
    tqh_last = 0x7ffff91e8f08}, name = 0x7ffff906bb60 "a9-scu",
dirty_log_mask = 0 '\000', ioeventfd_nb = 0, ioeventfds = 0x0,
  iommu_target_as = 0x0}

The TLB would only store this region instead of the whole page, and
subsequent access past the first 256 bytes would be emulated incorrectly.

For the record, I attach the patch I was using to fix Jan's.

Paolo
Jan Kiszka - May 7, 2013, 5:26 p.m.
On 2013-05-07 14:35, Paolo Bonzini wrote:
> Il 06/05/2013 22:46, Peter Maydell ha scritto:
>> On 6 May 2013 15:26, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> Simplify the sub-page handling by implementing it directly in the
>>> dispatcher instead of using a redirection memory region. We extend the
>>> phys_sections entries to optionally hold a pointer to the sub-section
>>> table that used to reside in the subpage_t structure. IOW, we add one
>>> optional dispatch level below the existing radix tree.
>>>
>>> address_space_lookup_region is extended to take this additional level
>>> into account. This direct dispatching to that target memory region will
>>> also be helpful when we want to add per-region locking control.
>>
>> This patch seems to break vexpress-a9. Test case if you want it:
>> http://staging.people.linaro.org/~peter.maydell/vexpress-3.8.tar.gz
>> (125MB) Edit the 'runme' script to fix up the paths to kernel/initrd/dtb
>> and then run it; before this patch it boots, afterwards it doesn't
>> even manage to start the kernel.
>>
>> My guess is you've broken subregion-sized mmio regions somehow
>> (and/or regions which are larger than a page in size but start
>> or finish at a non-page-aligned address), and probably in particular
>> the arm_gic regions that a9mpcore maps...
> 
> I think we just found out what all the subpage stuff is for. :)
> 
> When I added address_space_translate(), the trickiest conversion to the
> new API was in tlb_set_page.  Hence I added a "you never know"-style assert:
> 
>     assert(size >= TARGET_PAGE_SIZE);
>     if (size != TARGET_PAGE_SIZE) {
>         tlb_add_large_page(env, vaddr, size);
>      }
> -    section = phys_page_find(address_space_memory.dispatch,
> -			     paddr >> TARGET_PAGE_BITS);
> +    sz = size;
> +    section = address_space_translate(&address_space_memory, paddr,
> +				      &xlat, &sz, false);
> +    assert(sz >= TARGET_PAGE_SIZE);
> 
> 
> Now, remember that address_space_translate clamps sz on output to the
> size of the section.  And here's what happens:
> 
> (gdb) p sz
> $4 = 256
> (gdb) p *(section->mr)
> $5 = {ops = 0x7ffff82d33c0, iommu_ops = 0x0, opaque = 0x7ffff91e6b50,
> parent = 0x7ffff9069400, owner = 0x0, size = {lo = 256,
>     hi = 0}, addr = 0, destructor = 0x7ffff7e824d0
> <memory_region_destructor_none>, ram_addr = 18446744073709551615,
> terminates =
>     true, romd_mode = true, ram = false, readonly = false, enabled =
> true, rom_device = false, warning_printed = false,
>   flush_coalesced_mmio = false, alias = 0x0, alias_offset = 0, priority
> = 0, may_overlap = false, subregions = {tqh_first = 0x0,
>     tqh_last = 0x7ffff91e8ee8}, subregions_link = {tqe_next = 0x0,
> tqe_prev = 0x7ffff91e64f8}, coalesced = {tqh_first = 0x0,
>     tqh_last = 0x7ffff91e8f08}, name = 0x7ffff906bb60 "a9-scu",
> dirty_log_mask = 0 '\000', ioeventfd_nb = 0, ioeventfds = 0x0,
>   iommu_target_as = 0x0}
> 
> The TLB would only store this region instead of the whole page, and
> subsequent access past the first 256 bytes would be emulated incorrectly.

Good catch. Hmm, and a tricky issue. So we need to preserve the
dispatching read/write handler of sub-pages, and a reference to the
sub-page would continue to end up in the TCG TLBs. But reference
counting of the memory regions that page may point to becomes hairy.

Conceptually, we would have to increment the counter for all regions a
sub-page refers to. Even worse, regions may be added to the sub-page
while it is referenced by some user. Doesn't work.

Well, the alternative is to handle a sub-page dispatch (ie. calling into
subpage_[ram_]read/write just like address_space_rw: take the necessary
lock that protect mapping changes, look into the sub-page and pick up
the target region, invoke memory_region_ref on it, perform the access
and unref the region again. Slow, but that's how sub-pages are. And it
only affects TCG. Hmm, or does your IOMMU core cache translations on a
per-page base as well?

Jan
Jan Kiszka - May 7, 2013, 6:23 p.m.
On 2013-05-07 19:26, Jan Kiszka wrote:
> Well, the alternative is to handle a sub-page dispatch (ie. calling into
> subpage_[ram_]read/write just like address_space_rw: take the necessary
> lock that protect mapping changes, look into the sub-page and pick up
> the target region, invoke memory_region_ref on it, perform the access
> and unref the region again. Slow, but that's how sub-pages are. And it
> only affects TCG. Hmm, or does your IOMMU core cache translations on a
> per-page base as well?

OK, there is no translation caching in the memory core. So I will
preserve the dispatching functions of sub-pages, just like the term
"sub-page" - along with a comment why we depend on page granularity.

Jan
Paolo Bonzini - May 8, 2013, 8:41 a.m.
> On 2013-05-07 19:26, Jan Kiszka wrote:
> > Well, the alternative is to handle a sub-page dispatch (ie. calling into
> > subpage_[ram_]read/write just like address_space_rw: take the necessary
> > lock that protect mapping changes, look into the sub-page and pick up
> > the target region, invoke memory_region_ref on it, perform the access
> > and unref the region again. Slow, but that's how sub-pages are. And it
> > only affects TCG. Hmm, or does your IOMMU core cache translations on a
> > per-page base as well?
> 
> OK, there is no translation caching in the memory core. So I will
> preserve the dispatching functions of sub-pages, just like the term
> "sub-page" - along with a comment why we depend on page granularity.

Note that TCG will cache translations because the TLB entry is filled
with the page after translation has taken place. In fact, this is the
main change from Avi's series to mine, and a side-effect of centralizing
the translation in address_space_translate.

It shouldn't be a problem though, the TLB entry will point to the subpage
and the compiled code will dispatch to it.

Paolo

Patch

From 796abe4e7114d18e74cc869922cc5eb0813396c8 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 7 May 2013 11:30:23 +0200
Subject: [PATCH] subpage fix

Note: this temporarily breaks RAM regions in the I/O address space, but
there is none.  It will be fixed later when the special address space
listener is dropped.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 exec.c |   56 ++++++++++++--------------------------------------------
 1 files changed, 12 insertions(+), 44 deletions(-)

diff --git a/exec.c b/exec.c
index 7098632..21bd08d 100644
--- a/exec.c
+++ b/exec.c
@@ -65,7 +65,6 @@  AddressSpace address_space_io;
 AddressSpace address_space_memory;
 
 MemoryRegion io_mem_ram, io_mem_rom, io_mem_unassigned, io_mem_notdirty;
-static MemoryRegion io_mem_subpage_ram;
 
 #endif
 
@@ -80,7 +79,8 @@  int use_icount;
 
 #if !defined(CONFIG_USER_ONLY)
 
-#define SUBSECTION_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
+#define SUBSECTION_IDX(addr)      ((addr) & ~TARGET_PAGE_MASK)
+#define PHYS_SECTION_ID(psection) ((psection) - phys_sections)
 
 typedef struct PhysSection {
     MemoryRegionSection section;
@@ -695,7 +695,7 @@  hwaddr memory_region_section_get_iotlb(CPUArchState *env,
             iotlb |= phys_section_rom;
         }
     } else {
-        iotlb = container_of(section, PhysSection, section) - phys_sections;
+        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
         iotlb += xlat;
     }
 
@@ -782,7 +782,7 @@  static void register_subsection(AddressSpaceDispatch *d,
         .offset_within_address_space = base,
         .size = TARGET_PAGE_SIZE,
     };
-    uint16_t new_section;
+    uint16_t new_section, new_subsection;
     hwaddr start, end;
 
     assert(psection->sub_section ||
@@ -793,10 +793,17 @@  static void register_subsection(AddressSpaceDispatch *d,
         psection = &phys_sections[new_section];
         subsections_init(psection);
         phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
+    } else {
+        new_section = PHYS_SECTION_ID(psection);
     }
+
+    new_subsection = phys_section_add(section);
+
+    /* phys_section_add invalidates psection, reload it  */
+    psection = &phys_sections[new_section];
     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
     end = start + section->size - 1;
-    subsection_register(psection, start, end, phys_section_add(section));
+    subsection_register(psection, start, end, new_subsection);
 }
 
 
@@ -1607,38 +1614,6 @@  static const MemoryRegionOps watch_mem_ops = {
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
-static uint64_t subpage_ram_read(void *opaque, hwaddr addr,
-                                 unsigned size)
-{
-    ram_addr_t raddr = addr;
-    void *ptr = qemu_get_ram_ptr(raddr);
-    switch (size) {
-    case 1: return ldub_p(ptr);
-    case 2: return lduw_p(ptr);
-    case 4: return ldl_p(ptr);
-    default: abort();
-    }
-}
-
-static void subpage_ram_write(void *opaque, hwaddr addr,
-                              uint64_t value, unsigned size)
-{
-    ram_addr_t raddr = addr;
-    void *ptr = qemu_get_ram_ptr(raddr);
-    switch (size) {
-    case 1: return stb_p(ptr, value);
-    case 2: return stw_p(ptr, value);
-    case 4: return stl_p(ptr, value);
-    default: abort();
-    }
-}
-
-static const MemoryRegionOps subpage_ram_ops = {
-    .read = subpage_ram_read,
-    .write = subpage_ram_write,
-    .endianness = DEVICE_NATIVE_ENDIAN,
-};
-
 static int subsection_register(PhysSection *psection, uint32_t start,
                                uint32_t end, uint16_t section)
 {
@@ -1648,11 +1623,6 @@  static int subsection_register(PhysSection *psection, uint32_t start,
         return -1;
     idx = SUBSECTION_IDX(start);
     eidx = SUBSECTION_IDX(end);
-    if (memory_region_is_ram(phys_sections[section].section.mr)) {
-        MemoryRegionSection new_section = phys_sections[section].section;
-        new_section.mr = &io_mem_subpage_ram;
-        section = phys_section_add(&new_section);
-    }
     for (; idx <= eidx; idx++) {
         psection->sub_section[idx] = section;
     }
@@ -1692,8 +1662,6 @@  static void io_mem_init(void)
                           "unassigned", UINT64_MAX);
     memory_region_init_io(&io_mem_notdirty, &notdirty_mem_ops, NULL,
                           "notdirty", UINT64_MAX);
-    memory_region_init_io(&io_mem_subpage_ram, &subpage_ram_ops, NULL,
-                          "subpage-ram", UINT64_MAX);
     memory_region_init_io(&io_mem_watch, &watch_mem_ops, NULL,
                           "watch", UINT64_MAX);
 }
-- 
1.7.1