
[RFC,2/2] mem: prepare address_space listener rcu style

Message ID 1368415264-10800-3-git-send-email-qemulist@gmail.com
State New

Commit Message

pingfan liu May 13, 2013, 3:21 a.m. UTC
From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

Each address space listener has PhysPageMap *cur_map and *next_map;
the switch from cur_map to next_map completes the RCU-style update.
mem_commit() does the switch; readers are kept out for now by the
AddressSpace's lock, and later by the RCU mechanism (around
address_space_translate()).

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 exec.c                         |   36 +++++++++++++++++++++++++++---------
 include/exec/memory-internal.h |   11 ++++++++++-
 2 files changed, 37 insertions(+), 10 deletions(-)

Comments

Paolo Bonzini May 13, 2013, 9:31 a.m. UTC | #1
On 13/05/2013 05:21, Liu Ping Fan wrote:
> @@ -234,7 +236,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
>      IOMMUTLBEntry iotlb;
>      MemoryRegionSection *section;
>      hwaddr len = *plen;
> -
> +    PhysPageTable *pgtbl = cur_pgtbl;

d->cur_map->pgtbl.

> @@ -30,13 +30,22 @@ struct PhysPageEntry {
>      uint16_t ptr : 15;
>  };
>  
> +struct PhysPageTable;
> +typedef struct PhysPageMap PhysPageMap;
> +
> +struct PhysPageMap {
> +    PhysPageEntry root;
> +    struct PhysPageTable *pgtbl;

cur_pgtbl should be introduced in patch 1 already.

> +};
> +
>  typedef struct AddressSpaceDispatch AddressSpaceDispatch;
>  
>  struct AddressSpaceDispatch {
>      /* This is a multi-level map on the physical address space.
>       * The bottom level has pointers to MemoryRegionSections.
>       */
> -    PhysPageEntry phys_map;
> +    PhysPageMap *cur_map;
> +    PhysPageMap *next_map;

Pointers are quite expensive here.  With RCU we can fetch a consistent
root/table pair like this:

    rcu_read_lock();
    do {
        pgtbl = d->cur_pgtbl;
        smp_rmb();
        root = d->cur_root;

        /* RCU ensures that d->cur_pgtbl remains alive, thus it cannot
         * be recycled while this loop is running.  If
         * d->cur_pgtbl == pgtbl, the root is the right one for this
         * pgtable.
         */
        smp_rmb();
    } while (d->cur_pgtbl == pgtbl);
    ...
    rcu_read_unlock();

Remember to have a matching smp_wmb() in mem_commit, and to write ->root
first:

    old_pgtbl = d->cur_pgtbl;
    smp_wmb();
    d->cur_root = d->next_root;

    /* Write the root before updating the page table.  */
    smp_wmb();
    d->cur_pgtbl = d->next_pgtbl;

    /* Write cur_pgtbl before possibly destroying the old one.  */
    smp_mb();
    page_table_unref(old_pgtbl); /* uses call_rcu if --refcount == 0 */

If you are renaming fields, please do it as the first step.

Paolo

pingfan liu May 14, 2013, 5:47 a.m. UTC | #2
On Mon, May 13, 2013 at 5:31 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 13/05/2013 05:21, Liu Ping Fan wrote:
>> @@ -30,13 +30,22 @@ struct PhysPageEntry {
>>      uint16_t ptr : 15;
>>  };
>>
>> +struct PhysPageTable;
>> +typedef struct PhysPageMap PhysPageMap;
>> +
>> +struct PhysPageMap {
>> +    PhysPageEntry root;
>> +    struct PhysPageTable *pgtbl;
>
> cur_pgtbl should be introduced in patch 1 already.
>
PhysPageMap is a member of each AddressSpaceDispatch, and we achieve
RCU based on the PhysPageMap *cur_map = next_map switch.
cur_pgtbl/next_pgtbl are shared by all AddressSpaceDispatch, and the
outdated one will be reclaimed only after all AddressSpaceDispatch have
finished their RCU critical sections.

Maybe I did not lay out the patches correctly, but I originally wanted
to split this out to distinguish the objects shared by all
AddressSpaceDispatch from the ones each of them owns.
>> +};
>> +
>>  typedef struct AddressSpaceDispatch AddressSpaceDispatch;
>>
>>  struct AddressSpaceDispatch {
>>      /* This is a multi-level map on the physical address space.
>>       * The bottom level has pointers to MemoryRegionSections.
>>       */
>> -    PhysPageEntry phys_map;
>> +    PhysPageMap *cur_map;
>> +    PhysPageMap *next_map;
>
> Pointers are quite expensive here.  With RCU we can fetch a consistent
> root/table pair like this:
>
>     rcu_read_lock();
>     do {
>         pgtbl = d->cur_pgtbl;
>         smp_rmb();
>         root = d->cur_root;
>
>         /* RCU ensures that d->cur_pgtbl remains alive, thus it cannot
>          * be recycled while this loop is running.  If
>          * d->cur_pgtbl == pgtbl, the root is the right one for this
>          * pgtable.
>          */
>         smp_rmb();
>     } while (d->cur_pgtbl == pgtbl);
>     ...
>     rcu_read_unlock();
>
It seems to break the semantics of rcu_dereference() and rcu_assign().
If pointers are expensive, how about this:
if (unlikely(d->prev_map!=d->cur_map)) {
    d->root = d->cur_map->root;
    d->pgtbl = d->cur_map->pgtbl;
    d->prev_map = d->cur_map;
}
So usually, we use the cached value.

Thanks and regards,
Pingfan
Paolo Bonzini May 14, 2013, 9:34 a.m. UTC | #3
On 14/05/2013 07:47, liu ping fan wrote:
> On Mon, May 13, 2013 at 5:31 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> On 13/05/2013 05:21, Liu Ping Fan wrote:
>>> @@ -30,13 +30,22 @@ struct PhysPageEntry {
>>>      uint16_t ptr : 15;
>>>  };
>>>
>>> +struct PhysPageTable;
>>> +typedef struct PhysPageMap PhysPageMap;
>>> +
>>> +struct PhysPageMap {
>>> +    PhysPageEntry root;
>>> +    struct PhysPageTable *pgtbl;
>>
>> cur_pgtbl should be introduced in patch 1 already.
>>
> PhysPageMap is a member of each AddressSpaceDispatch, and we achieve
> RCU based on the PhysPageMap *cur_map = next_map switch.

But assignment of cur_map is not atomic.  That's what I was trying to
achieve with the do...while loop below.

> cur_pgtbl/next_pgtbl are shared by all AddressSpaceDispatch.

You cannot share them, because the root of some AddressSpaceDispatch
structures will still refer to the old pagetable.  You need to attach
the root of each ASD to the right pagetable, and the simplest way to do
it is to put cur_pgtbl/next_pgtbl in the ASD.  And this problem exists
in the first patch already, so that's where you have to put it.
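
(A sketch of the per-ASD layout this suggests; the field names follow
the snippets later in the thread and are illustrative, not code from
the patch:)

    struct AddressSpaceDispatch {
        /* Readers sample these two together with the load-and-check
         * idiom discussed in this thread.  */
        PhysPageEntry cur_root;
        PhysPageTable *cur_pgtbl;

        /* Built between mem_begin() and mem_commit().  */
        PhysPageEntry next_root;
        PhysPageTable *next_pgtbl;

        MemoryListener listener;
    };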

>>> +};
>>> +
>>>  typedef struct AddressSpaceDispatch AddressSpaceDispatch;
>>>
>>>  struct AddressSpaceDispatch {
>>>      /* This is a multi-level map on the physical address space.
>>>       * The bottom level has pointers to MemoryRegionSections.
>>>       */
>>> -    PhysPageEntry phys_map;
>>> +    PhysPageMap *cur_map;
>>> +    PhysPageMap *next_map;
>>
>> Pointers are quite expensive here.  With RCU we can fetch a consistent
>> root/table pair like this:
>>
>>     rcu_read_lock();
>>     do {
>>         pgtbl = d->cur_pgtbl;
>>         smp_rmb();
>>         root = d->cur_root;
>>
>>         /* RCU ensures that d->cur_pgtbl remains alive, thus it cannot
>>          * be recycled while this loop is running.  If
>>          * d->cur_pgtbl == pgtbl, the root is the right one for this
>>          * pgtable.
>>          */
>>         smp_rmb();
>>     } while (d->cur_pgtbl == pgtbl);

Ouch, != of course.

>>     ...
>>     rcu_read_unlock();
>>
>> It seems to break the semantics of rcu_dereference() and rcu_assign().

It doesn't.  In fact it is even stronger, I'm using a "full" rmb instead
of read_barrier_depends.

> If pointers are expensive, how about this:
> if (unlikely(d->prev_map!=d->cur_map)) {
>     d->root = d->cur_map->root;
>>     d->pgtbl = d->cur_map->pgtbl;
>     d->prev_map = d->cur_map;
> }
>> So usually, we use the cached value.

Doesn't work, it has an ABA problem.  In my solution above, RCU avoids
ABA because the read and the check are under the same RCU critical
section.  In your solution, the read and the check are under different
RCU critical sections, so it doesn't work.

Paolo

pingfan liu May 15, 2013, 1:29 a.m. UTC | #4
On Tue, May 14, 2013 at 5:34 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 14/05/2013 07:47, liu ping fan wrote:
>> On Mon, May 13, 2013 at 5:31 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> On 13/05/2013 05:21, Liu Ping Fan wrote:
>>>> @@ -30,13 +30,22 @@ struct PhysPageEntry {
>>>>      uint16_t ptr : 15;
>>>>  };
>>>>
>>>> +struct PhysPageTable;
>>>> +typedef struct PhysPageMap PhysPageMap;
>>>> +
>>>> +struct PhysPageMap {
>>>> +    PhysPageEntry root;
>>>> +    struct PhysPageTable *pgtbl;
>>>
>>> cur_pgtbl should be introduced in patch 1 already.
>>>
>> PhysPageMap is a member of each AddressSpaceDispatch, and we achieve
>> RCU based on the PhysPageMap *cur_map = next_map switch.
>
> But assignment of cur_map is not atomic.  That's what I was trying to
> achieve with the do...while loop below.
>
It is atomic.  The reason why I introduced PhysPageMap * is to ensure
an atomic assignment for RCU, since root and pgtbl cannot be changed
together atomically.

>> cur_pgtbl/next_pgtbl are shared by all AddressSpaceDispatch.
>
> You cannot share them, because the root of some AddressSpaceDispatch
> structures will still refer to the old pagetable.  You need to attach

All the AddressSpaceDispatch updates overlap in mem_commit(), and the
shared cur_pgtbl will be reclaimed only after the old maps of all the
AddressSpaceDispatch have been reclaimed.  So at that time, no "root
of some AddressSpaceDispatch structures will still refer to the old
pagetable".

> the root of each ASD to the right pagetable, and the simplest way to do
> it is to put cur_pgtbl/next_pgtbl in the ASD.  And this problem exists
> in the first patch already, so that's where you have to put it.
>
I will implement them in each ASD, since that is more coherent with
the RCU model (not coalescing updaters).
>>>> +};
>>>> +
>>>>  typedef struct AddressSpaceDispatch AddressSpaceDispatch;
>>>>
>>>>  struct AddressSpaceDispatch {
>>>>      /* This is a multi-level map on the physical address space.
>>>>       * The bottom level has pointers to MemoryRegionSections.
>>>>       */
>>>> -    PhysPageEntry phys_map;
>>>> +    PhysPageMap *cur_map;
>>>> +    PhysPageMap *next_map;
>>>
>>> Pointers are quite expensive here.  With RCU we can fetch a consistent
>>> root/table pair like this:
>>>
>>>     rcu_read_lock();
>>>     do {
>>>         pgtbl = d->cur_pgtbl;
>>>         smp_rmb();
>>>         root = d->cur_root;
>>>
>>>         /* RCU ensures that d->cur_pgtbl remains alive, thus it cannot
>>>          * be recycled while this loop is running.  If
>>>          * d->cur_pgtbl == pgtbl, the root is the right one for this
>>>          * pgtable.
>>>          */
>>>         smp_rmb();
>>>     } while (d->cur_pgtbl == pgtbl);
>
> Ouch, != of course.
>
>>>     ...
>>>     rcu_read_unlock();
>>>
>>>> It seems to break the semantics of rcu_dereference() and rcu_assign().
>
> It doesn't.  In fact it is even stronger, I'm using a "full" rmb instead
> of read_barrier_depends.
>
rcu_dereference()/rcu_assign() ensure the switch from prev to next
version, based on atomic-ops.  I think your method _does_ work, based
on a read+check+barrier technique, but it is not the standard RCU
method, and it exports some concepts (barriers) outside RCU.

>> If pointers are expensive, how about this:
>> if (unlikely(d->prev_map!=d->cur_map)) {
>>     d->root = d->cur_map->root;
>>     d->pgtbl = d->cur_map->pgtbl;
>>     d->prev_map = d->cur_map;
>> }
>> So usually, we use the cached value.
>
rcu_read_lock();
map = rcu_dereference(d->cur_map);
if (unlikely(d->prev_map != map)) {
    d->root = map->root;
    d->pgtbl = map->pgtbl;
}
......
rcu_read_unlock();

Then it can avoid ABA problem.

Thanks and regards,
Pingfan

Paolo Bonzini May 15, 2013, 8:22 a.m. UTC | #5
On 15/05/2013 03:29, liu ping fan wrote:
>>>> Pointers are quite expensive here.  With RCU we can fetch a consistent
>>>> root/table pair like this:
>>>>
>>>>     rcu_read_lock();
>>>>     do {
>>>>         pgtbl = d->cur_pgtbl;
>>>>         smp_rmb();
>>>>         root = d->cur_root;
>>>>
>>>>         /* RCU ensures that d->cur_pgtbl remains alive, thus it cannot
>>>>          * be recycled while this loop is running.  If
>>>>          * d->cur_pgtbl == pgtbl, the root is the right one for this
>>>>          * pgtable.
>>>>          */
>>>>         smp_rmb();
>>>>     } while (d->cur_pgtbl == pgtbl);
>>
>> Ouch, != of course.
>>
>>>>     ...
>>>>     rcu_read_unlock();
>>>>
>>> It seems to break the semantics of rcu_dereference() and rcu_assign().
>>
>> It doesn't.  In fact it is even stronger, I'm using a "full" rmb instead
>> of read_barrier_depends.
>>
> rcu_dereference()/rcu_assign() ensure the switch from prev to next
> version, based on atomic-ops.

rcu_dereference()/rcu_assign() are not magic, they are simply
read+read_barrier_depends and wmb+write.
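
(Roughly, in sketch form — the classic definitions, eliding the
volatile casts and type checks of the real macros:)

    #define rcu_dereference(p) ({                \
        typeof(p) p_ = ACCESS_ONCE(p);           \
        smp_read_barrier_depends();              \
        p_;                                      \
    })

    #define rcu_assign_pointer(p, v) do {        \
        smp_wmb();                               \
        ACCESS_ONCE(p) = (v);                    \
    } while (0)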

> I think your method _does_ work, based on
> a read+check+barrier technique, but it is not the standard RCU
> method, and it exports some concepts (barriers) outside RCU.

It is a standard method to load 2 words and ensure it is consistent.  If
you want to use rcu_dereference(&d->cur_pgtbl) and
rcu_dereference(&d->cur_root), that's fine.  But you still need the read
barrier.

>>> If pointers are expensive, how about this:
>>> if (unlikely(d->prev_map!=d->cur_map)) {
>>>     d->root = d->cur_map->root;
>>>>     d->pgtbl = d->cur_map->pgtbl;
>>>     d->prev_map = d->cur_map;
>>> }
>>>> So usually, we use the cached value.
>>
> rcu_read_lock();
>> map = rcu_dereference(d->cur_map);
>> if (unlikely(d->prev_map != map)) {
>     d->root = map->root;
>     d->pgtbl = map->pgtbl;
> }
> ......
> rcu_read_unlock();
> 
> Then it can avoid ABA problem.

I don't see the assignment of prev_map, which is where the ABA problem
arises.

Paolo
pingfan liu May 15, 2013, 9:11 a.m. UTC | #6
On Wed, May 15, 2013 at 4:22 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 15/05/2013 03:29, liu ping fan wrote:
>>>> It seems to break the semantics of rcu_dereference() and rcu_assign().
>>>
>>> It doesn't.  In fact it is even stronger, I'm using a "full" rmb instead
>>> of read_barrier_depends.
>>>
>> rcu_dereference()/rcu_assign() ensure the switch from prev to next
>> version, based on atomic-ops.
>
> rcu_dereference()/rcu_assign() are not magic, they are simply
> read+read_barrier_depends and wmb+write.
>
Yes.
>> I think your method _does_ work, based on
>> a read+check+barrier technique, but it is not the standard RCU
>> method, and it exports some concepts (barriers) outside RCU.
>
> It is a standard method to load 2 words and ensure it is consistent.  If

Yes, it is.
> you want to use rcu_dereference(&d->cur_pgtbl) and
> rcu_dereference(&d->cur_root), that's fine.  But you still need the read
> barrier.
>
Ok.
>>>> If pointers are expensive, how about this:
>>>> if (unlikely(d->prev_map!=d->cur_map)) {
>>>>     d->root = d->cur_map->root;
>>>>     d->pgtbl = d->cur_map->pgtbl;
>>>>     d->prev_map = d->cur_map;
>>>> }
>>>> So usually, we use the cached value.
>>>
>> rcu_read_lock();
>> map = rcu_dereference(d->cur_map);
>> if (unlikely(d->prev_map != map)) {
>>     d->root = map->root;
>>     d->pgtbl = map->pgtbl;
------------------------------------> d->prev_map = map; // As you
mentioned above, with RCU, ABA cannot occur

Regards,
Pingfan
>> }
>> ......
>> rcu_read_unlock();
>>
>> Then it can avoid ABA problem.
>
> I don't see the assignment of prev_map, which is where the ABA problem
> arises.
>
> Paolo
Paolo Bonzini May 15, 2013, 9:19 a.m. UTC | #7
> > > rcu_read_lock();
> > > map = rcu_dereference(d->cur_map);
> > > if (unlikely(d->prev_map != map)) {
> > >     d->root = map->root;
> > >     d->pgtbl = map->pgtbl;
> > > }
> > > ......
> > > rcu_read_unlock();
> > >
> > > Then it can avoid ABA problem.
> >
> > I don't see the assignment of prev_map, which is where the ABA problem
> > arises.

If you do this:

> rcu_read_lock();
> map = rcu_dereference(d->cur_map);
> if (unlikely(d->prev_map != map)) {
>     d->root = map->root;
>     d->pgtbl = map->pgtbl;
>     d->prev_map = map;
> }
> ....
> rcu_read_unlock();

It can happen.  The problem is that prev_map can change _after_
the rcu_read_unlock.  It is the same as your previous solution.
Please just use the load-and-check idiom.  I'll post my RCU
implementation as soon as I finish the documentation.
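
(A hypothetical interleaving showing the race — reader R caches through
prev_map, writer W commits twice, and the allocator reuses address A:)

    /* R */ rcu_read_lock();
    /* R */ map = rcu_dereference(d->cur_map);    /* map == A */
    /* R */ d->prev_map = map;                    /* caches A */
    /* R */ rcu_read_unlock();
    /* W */ d->cur_map = B;                       /* commit #1 */
    /* W */ g_free(A);                            /* grace period over */
    /* W */ d->cur_map = g_new0(PhysPageMap, 1);  /* commit #2: malloc
                                                     may return A again */
    /* R */ rcu_read_lock();
    /* R */ map = rcu_dereference(d->cur_map);    /* map == A once more */
    /* R */ if (d->prev_map != map) { ... }       /* false: stale cached
                                                     root/pgtbl are used */
    /* R */ rcu_read_unlock();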

Paolo
Paolo Bonzini May 16, 2013, 9:09 a.m. UTC | #8
Better:

    rcu_read_lock();
    do {
        pgtbl = d->cur_pgtbl;
        smp_rmb();
        root = d->cur_root;
        smp_rmb();

        /* d->cur_pgtbl == d->next_pgtbl only during an update.  */
    } while (pgtbl == d->next_pgtbl);
    ...
    rcu_read_unlock();

And in the writer:

    old_pgtbl = d->cur_pgtbl;

    /* Point to the new page table, tell readers cur_root is invalid.  */
    smp_wmb();
    d->cur_pgtbl = d->next_pgtbl;

    /* Write the root before updating the page table.  */
    smp_wmb();
    d->cur_root = d->next_root;

    /* Write cur_root before telling readers it is valid.  */
    smp_wmb();
    d->next_pgtbl = NULL;

    /* Drop reference once readers will be done with it.  */
    call_rcu(page_table_unref, old_pgtbl, rcu);

Paolo
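
(For completeness, a sketch of the page_table_unref() side assumed
above — hypothetical, since QEMU had no call_rcu() yet at this point;
the refcount and rcu fields on PhysPageTable are assumptions as well:)

    /* RCU callback: runs once all readers that could still see the old
     * page table have left their critical sections.  */
    static void page_table_free(struct rcu_head *head)
    {
        PhysPageTable *pgtbl = container_of(head, PhysPageTable, rcu);

        g_free(pgtbl->phys_sections);
        g_free(pgtbl->phys_map_nodes);
        g_free(pgtbl);
    }

    static void page_table_unref(PhysPageTable *pgtbl)
    {
        /* Called with the big lock held, so a plain decrement is
         * enough here.  */
        if (--pgtbl->refcount == 0) {
            call_rcu(&pgtbl->rcu, page_table_free);
        }
    }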

Patch

diff --git a/exec.c b/exec.c
index bb4e540..e5871d6 100644
--- a/exec.c
+++ b/exec.c
@@ -186,24 +186,26 @@  static void phys_page_set(AddressSpaceDispatch *d,
                           hwaddr index, hwaddr nb,
                           uint16_t leaf)
 {
+    PhysPageMap *map = d->next_map;
     /* Wildly overreserve - it doesn't matter much. */
     phys_map_node_reserve(3 * P_L2_LEVELS);
 
-    phys_page_set_level(&d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
+    phys_page_set_level(&map->root, &index, &nb, leaf, P_L2_LEVELS - 1);
 }
 
 static PhysSection *phys_section_find(AddressSpaceDispatch *d,
                                       hwaddr index)
 {
-    PhysPageEntry lp = d->phys_map;
     PhysPageEntry *p;
-    PhysSection *phys_sections = cur_pgtbl->phys_sections;
-    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
+    PhysPageEntry lp = d->cur_map->root;
+    PhysPageTable *pgtbl = d->cur_map->pgtbl;
+    PhysSection *phys_sections = pgtbl->phys_sections;
+    Node *phys_map_nodes = pgtbl->phys_map_nodes;
     int i;
 
     for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
         if (lp.ptr == PHYS_MAP_NODE_NIL) {
-            return &phys_sections[cur_pgtbl->phys_section_unassigned];
+            return &phys_sections[pgtbl->phys_section_unassigned];
         }
         p = phys_map_nodes[lp.ptr];
         lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
@@ -234,7 +236,7 @@  MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
     IOMMUTLBEntry iotlb;
     MemoryRegionSection *section;
     hwaddr len = *plen;
-
+    PhysPageTable *pgtbl = cur_pgtbl;
     for (;;) {
         section = address_space_lookup_region(as, addr);
 
@@ -254,7 +256,7 @@  MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
                 | (addr & iotlb.addr_mask));
         len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
         if (!iotlb.perm[is_write]) {
-            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
+            section = &pgtbl->phys_sections[pgtbl->phys_section_unassigned].section;
             break;
         }
 
@@ -1703,7 +1705,21 @@  static void mem_begin(MemoryListener *listener)
 {
     AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
 
-    d->phys_map.ptr = PHYS_MAP_NODE_NIL;
+    d->next_map = g_new0(PhysPageMap, 1);
+    d->next_map->pgtbl = next_pgtbl;
+}
+
+static void mem_commit(MemoryListener *listener)
+{
+    AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
+    PhysPageMap *m = d->cur_map;
+
+    d->cur_map = d->next_map;
+    /* FIXME: currently we rely on the big lock or the address-space
+     * lock against readers, so here we can safely free the old map.
+     * After RCU is introduced, this should become call_rcu().
+     */
+    g_free(m);
 }
 
 static void core_begin(MemoryListener *listener)
@@ -1771,11 +1787,12 @@  void address_space_init_dispatch(AddressSpace *as)
 {
     AddressSpaceDispatch *d = g_new(AddressSpaceDispatch, 1);
 
-    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 0 };
+    d->cur_map = g_new0(PhysPageMap, 1);
     d->listener = (MemoryListener) {
         .begin = mem_begin,
         .region_add = mem_add,
         .region_nop = mem_add,
+        .commit = mem_commit,
         .priority = 0,
     };
     as->dispatch = d;
@@ -1787,6 +1804,7 @@  void address_space_destroy_dispatch(AddressSpace *as)
     AddressSpaceDispatch *d = as->dispatch;
 
     memory_listener_unregister(&d->listener);
+    g_free(d->cur_map);
     g_free(d);
     as->dispatch = NULL;
 }
diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h
index 1b156fd..0dfe260 100644
--- a/include/exec/memory-internal.h
+++ b/include/exec/memory-internal.h
@@ -30,13 +30,22 @@  struct PhysPageEntry {
     uint16_t ptr : 15;
 };
 
+struct PhysPageTable;
+typedef struct PhysPageMap PhysPageMap;
+
+struct PhysPageMap {
+    PhysPageEntry root;
+    struct PhysPageTable *pgtbl;
+};
+
 typedef struct AddressSpaceDispatch AddressSpaceDispatch;
 
 struct AddressSpaceDispatch {
     /* This is a multi-level map on the physical address space.
      * The bottom level has pointers to MemoryRegionSections.
      */
-    PhysPageEntry phys_map;
+    PhysPageMap *cur_map;
+    PhysPageMap *next_map;
     MemoryListener listener;
 };