Patchwork [RFC,1/2] mem: make phys_section and phys_map_nodes prepared for RCU

login
register
mail settings
Submitter pingfan liu
Date May 13, 2013, 3:21 a.m.
Message ID <1368415264-10800-2-git-send-email-qemulist@gmail.com>
Download mbox | patch
Permalink /patch/243255/
State New
Headers show

Comments

pingfan liu - May 13, 2013, 3:21 a.m.
From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

Now, each AddressSpaceDispatch has its own radix-tree, and all of them
lie on phys_section[] and phys_map_nodes[]. When we want lockless
mmio dispatch, we need something like RCU.

Achieve this with PhysPageTable, which contains all of the info for all
radix trees. After all address space listeners update (i.e. excluding the
readers) we switch from PhysPageTable *cur_pgtbl to *next_pgtbl.
(The real RCU style is adopted by the listener; see the next patch.)

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 exec.c                |  197 +++++++++++++++++++++++++------------------------
 include/exec/memory.h |    2 +
 memory.c              |    2 +
 3 files changed, 106 insertions(+), 95 deletions(-)
Paolo Bonzini - May 13, 2013, 9:20 a.m.
Il 13/05/2013 05:21, Liu Ping Fan ha scritto:
> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> 
> Now, each AddressSpaceDispatch has its own radix-tree, and all of them
> lie on phys_section[] and phys_map_nodes[]. When we want lockless
> mmio dispatch, we need something like RCU.
> 
> Acheive this with PhysPageTable which contains all of the info for all
> radix trees. After all address space listeners update (ie. excluding the
> readers) we switch from PhysPageTable *cur_pgtbl to *next_pgtbl.
> (The real RCU style is adopt by listener, see next patch)
> 
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> ---
>  exec.c                |  197 +++++++++++++++++++++++++------------------------
>  include/exec/memory.h |    2 +
>  memory.c              |    2 +
>  3 files changed, 106 insertions(+), 95 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index c5f8082..bb4e540 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -80,23 +80,34 @@ int use_icount;
>  #if !defined(CONFIG_USER_ONLY)
>  
>  #define SUBSECTION_IDX(addr)      ((addr) & ~TARGET_PAGE_MASK)
> -#define PHYS_SECTION_ID(psection) ((psection) - phys_sections)
> +#define PHYS_SECTION_ID(psection, base) ((psection) - base->phys_sections)
>  
>  typedef struct PhysSection {
>      MemoryRegionSection section;
>      uint16_t *sub_section;
>  } PhysSection;
>  
> -static PhysSection *phys_sections;
> -static unsigned phys_sections_nb, phys_sections_nb_alloc;
> -static uint16_t phys_section_unassigned;
> -static uint16_t phys_section_notdirty;
> -static uint16_t phys_section_rom;
> -static uint16_t phys_section_watch;
> +typedef PhysPageEntry Node[L2_SIZE];
>  
> -/* Simple allocator for PhysPageEntry nodes */
> -static PhysPageEntry (*phys_map_nodes)[L2_SIZE];
> -static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc;
> +typedef struct PhysPageTable PhysPageTable;
> +
> +struct PhysPageTable {
> +    int ref;
> +    PhysSection *phys_sections;
> +    unsigned phys_sections_nb;
> +    unsigned phys_sections_nb_alloc;
> +    uint16_t phys_section_unassigned;
> +    uint16_t phys_section_notdirty;
> +    uint16_t phys_section_rom;
> +    uint16_t phys_section_watch;

These four should be constants.  Please make them #defines and add
assertions that they have the expected values (in a separate patch).

> +
> +    Node *phys_map_nodes;
> +    unsigned phys_map_nodes_nb;
> +    unsigned phys_map_nodes_nb_alloc;
> +};
> +static PhysPageTable *cur_pgtbl;
> +static PhysPageTable *next_pgtbl;

You shouldn't need cur_pgtbl.  Instead, each AddressSpaceDispatch should
have a pointer to its own cur_pgtbl.  In the commit hook you can then
take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
the new one.

The lock must also be taken in address_space_translate, together with a
relatively expensive ref/unref pair (two atomic operations).  We
probably need real RCU so that we can skip the ref/unref.

>  
>  #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>  
> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>  
>  static void phys_map_node_reserve(unsigned nodes)
>  {
> -    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
> +    if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
>          typedef PhysPageEntry Node[L2_SIZE];
> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
> -                                      phys_map_nodes_nb + nodes);
> -        phys_map_nodes = g_renew(Node, phys_map_nodes,
> -                                 phys_map_nodes_nb_alloc);
> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
> +                                      next_pgtbl->phys_map_nodes_nb + nodes);
> +        next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
> +                                 next_pgtbl->phys_map_nodes_nb_alloc);
>      }
>  }
>  
> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
>      unsigned i;
>      uint16_t ret;
>  
> -    ret = phys_map_nodes_nb++;
> +    ret = next_pgtbl->phys_map_nodes_nb++;
>      assert(ret != PHYS_MAP_NODE_NIL);
> -    assert(ret != phys_map_nodes_nb_alloc);
> +    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
>      for (i = 0; i < L2_SIZE; ++i) {
> -        phys_map_nodes[ret][i].is_leaf = 0;
> -        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
> +        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
> +        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>      }
>      return ret;
>  }
>  
> -static void phys_map_nodes_reset(void)
> -{
> -    phys_map_nodes_nb = 0;
> -}
> -
> -
>  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>                                  hwaddr *nb, uint16_t leaf,
>                                  int level)
> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>  
>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>          lp->ptr = phys_map_node_alloc();
> -        p = phys_map_nodes[lp->ptr];
> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>          if (level == 0) {
>              for (i = 0; i < L2_SIZE; i++) {
>                  p[i].is_leaf = 1;
> -                p[i].ptr = phys_section_unassigned;
> +                p[i].ptr = next_pgtbl->phys_section_unassigned;
>              }
>          }
>      } else {
> -        p = phys_map_nodes[lp->ptr];
> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>      }
>      lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>  
> @@ -192,11 +197,13 @@ static PhysSection *phys_section_find(AddressSpaceDispatch *d,
>  {
>      PhysPageEntry lp = d->phys_map;
>      PhysPageEntry *p;
> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
> +    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>      int i;
>  
>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
> -            return &phys_sections[phys_section_unassigned];
> +            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>          }
>          p = phys_map_nodes[lp.ptr];
>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
> @@ -209,6 +216,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
>  {
>      PhysSection *psection;
>      uint16_t idx;
> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>  
>      psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
>      if (psection->sub_section) {
> @@ -246,7 +254,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
>                  | (addr & iotlb.addr_mask));
>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>          if (!iotlb.perm[is_write]) {
> -            section = &phys_sections[phys_section_unassigned].section;
> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>              break;
>          }
>  
> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>          iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
>              + xlat;
>          if (!section->readonly) {
> -            iotlb |= phys_section_notdirty;
> +            iotlb |= cur_pgtbl->phys_section_notdirty;
>          } else {
> -            iotlb |= phys_section_rom;
> +            iotlb |= cur_pgtbl->phys_section_rom;
>          }
>      } else {
> -        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
> +        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
>          iotlb += xlat;
>      }
>  
> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
>              /* Avoid trapping reads of pages with a write breakpoint. */
>              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
> -                iotlb = phys_section_watch + paddr;
> +                iotlb = cur_pgtbl->phys_section_watch + paddr;
>                  *address |= TLB_MMIO;
>                  break;
>              }
> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, uint32_t start,
>                                 uint32_t end, uint16_t section);
>  static void subsections_init(PhysSection *psection);
>  
> -static void destroy_page_desc(uint16_t section_index)
> +/* Call after all listener has been commit.
> +  * we do not walk over tree, just simply drop.
> +  */
> +static void destroy_pagetable(PhysPageTable *pgtbl)
>  {
> -    g_free(phys_sections[section_index].sub_section);
> -}
> -
> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
> -{
> -    unsigned i;
> -    PhysPageEntry *p;
> +    int i;
>  
> -    if (lp->ptr == PHYS_MAP_NODE_NIL) {
> -        return;
> -    }
> +    g_free(pgtbl->phys_map_nodes);
>  
> -    p = phys_map_nodes[lp->ptr];
> -    for (i = 0; i < L2_SIZE; ++i) {
> -        if (!p[i].is_leaf) {
> -            destroy_l2_mapping(&p[i], level - 1);
> +    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
> +        if (pgtbl->phys_sections[i].sub_section) {
> +            g_free(pgtbl->phys_sections[i].sub_section);
>          } else {
> -            destroy_page_desc(p[i].ptr);
> +            memory_region_unref(pgtbl->phys_sections[i].section.mr);
>          }
>      }
> -    lp->is_leaf = 0;
> -    lp->ptr = PHYS_MAP_NODE_NIL;
> -}
> +    g_free(pgtbl->phys_sections);
>  
> -static void destroy_all_mappings(AddressSpaceDispatch *d)
> -{
> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
> -    phys_map_nodes_reset();
> +    g_free(pgtbl);
>  }

This should be a separate patch.

> -static uint16_t phys_section_add(MemoryRegionSection *section)
> +static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)

phys_section_add will always add to next_pgtbl.

>  {
> -    assert(phys_sections_nb < TARGET_PAGE_SIZE);
> +    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>  
> -    if (phys_sections_nb == phys_sections_nb_alloc) {
> -        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
> -        phys_sections = g_renew(PhysSection, phys_sections,
> -                                phys_sections_nb_alloc);
> +    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
> +        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
> +        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
> +                                pgtbl->phys_sections_nb_alloc);
>      }
> -    phys_sections[phys_sections_nb].section = *section;
> -    phys_sections[phys_sections_nb].sub_section = NULL;
> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
>      memory_region_ref(section->mr);
> -    return phys_sections_nb++;
> -}
> -
> -static void phys_sections_clear(void)
> -{
> -    while (phys_sections_nb > 0) {
> -        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
> -        memory_region_unref(phys_section->section.mr);
> -    }
> +    return pgtbl->phys_sections_nb++;
>  }
>  
>  static void register_subsection(AddressSpaceDispatch *d,
> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch *d,
>             psection->section.mr == &io_mem_unassigned);
>  
>      if (!psection->sub_section) {
> -        new_section = phys_section_add(&subsection);
> -        psection = &phys_sections[new_section];
> +        new_section = phys_section_add(&subsection, next_pgtbl);
> +        psection = &next_pgtbl->phys_sections[new_section];
>          subsections_init(psection);
>          phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
>      } else {
> -        new_section = PHYS_SECTION_ID(psection);
> +        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
>      }
>  
> -    new_subsection = phys_section_add(section);
> +    new_subsection = phys_section_add(section, next_pgtbl);
>  
>      /* phys_section_add invalidates psection, reload it  */
> -    psection = &phys_sections[new_section];
> +    psection = &next_pgtbl->phys_sections[new_section];
>      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
>      end = start + section->size - 1;
>      subsection_register(psection, start, end, new_subsection);
> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
>      hwaddr start_addr = section->offset_within_address_space;
>      ram_addr_t size = section->size;
>      hwaddr addr;
> -    uint16_t section_index = phys_section_add(section);
> +    uint16_t section_index = phys_section_add(section, next_pgtbl);
>  
>      assert(size);
>  
> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
>  {
>      psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
>      subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
> -                        phys_section_unassigned);
> +                        next_pgtbl->phys_section_unassigned);
>  }
>  
>  static uint16_t dummy_section(MemoryRegion *mr)
> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
>          .size = UINT64_MAX,
>      };
>  
> -    return phys_section_add(&section);
> +    return phys_section_add(&section, next_pgtbl);
>  }
>  
>  MemoryRegion *iotlb_to_region(hwaddr index)
>  {
> -    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
> +    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>  }
>  
>  static void io_mem_init(void)
> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
>                            "watch", UINT64_MAX);
>  }
>  
> +void global_begin(void)
> +{
> +    next_pgtbl = g_new0(PhysPageTable, 1);
> +    next_pgtbl->ref = 1;
> +    next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
> +    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
> +    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
> +    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
> +}
> +
> +/* other listeners finished */
> +void global_commit(void)
> +{
> +    PhysPageTable *t = cur_pgtbl;
> +
> +    cur_pgtbl = next_pgtbl;
> +    /* Fix me,  currently, we rely on each address space listener agaist its
> +      * reader. So when we come here, no readers will touch old phys_map_node.
> +      * After rcu, should changed to call_rcu()
> +      */
> +    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
> +        destroy_pagetable(t);
> +    }

global_commit should not be needed, and global_begin should be simply
the new core_begin.  It should unref next_pgtbl and reallocate a new one.


> +}
> +
>  static void mem_begin(MemoryListener *listener)
>  {
>      AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
>  
> -    destroy_all_mappings(d);
>      d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>  }
>  
>  static void core_begin(MemoryListener *listener)
>  {
> -    phys_sections_clear();
> -    phys_section_unassigned = dummy_section(&io_mem_unassigned);
> -    phys_section_notdirty = dummy_section(&io_mem_notdirty);
> -    phys_section_rom = dummy_section(&io_mem_rom);
> -    phys_section_watch = dummy_section(&io_mem_watch);
>  }
>  
>  static void tcg_commit(MemoryListener *listener)
> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
>      AddressSpaceDispatch *d = as->dispatch;
>  
>      memory_listener_unregister(&d->listener);
> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>      g_free(d);
>      as->dispatch = NULL;
>  }
> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>  
>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>          if (memory_region_is_ram(section->mr)) {
> -            section = &phys_sections[phys_section_rom].section;
> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>          }
>          io_mem_write(section->mr, addr, val, 4);
>      } else {
> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>  
>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>          if (memory_region_is_ram(section->mr)) {
> -            section = &phys_sections[phys_section_rom].section;
> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>          }
>  #ifdef TARGET_WORDS_BIGENDIAN
>          io_mem_write(section->mr, addr, val >> 32, 4);
> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, uint32_t val,
>  
>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>          if (memory_region_is_ram(section->mr)) {
> -            section = &phys_sections[phys_section_rom].section;
> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>          }
>  #if defined(TARGET_WORDS_BIGENDIAN)
>          if (endian == DEVICE_LITTLE_ENDIAN) {
> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, uint32_t val,
>  
>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>          if (memory_region_is_ram(section->mr)) {
> -            section = &phys_sections[phys_section_rom].section;
> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>          }
>  #if defined(TARGET_WORDS_BIGENDIAN)
>          if (endian == DEVICE_LITTLE_ENDIAN) {
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index b97ace7..cc654fa 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>                           int is_write, hwaddr access_len);
>  
> +void global_begin(void);
> +void global_commit(void);
>  
>  #endif
>  
> diff --git a/memory.c b/memory.c
> index 1a86607..da06dfd 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
>      --memory_region_transaction_depth;
>      if (!memory_region_transaction_depth && memory_region_update_pending) {
>          memory_region_update_pending = false;
> +        global_begin();
>          MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>  
>          QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
>          }
>  
>          MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
> +        global_commit();
>      }
>  }
>  
>
pingfan liu - May 14, 2013, 3:38 a.m.
On Mon, May 13, 2013 at 5:20 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 13/05/2013 05:21, Liu Ping Fan ha scritto:
>> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>
>> Now, each AddressSpaceDispatch has its own radix-tree, and all of them
>> lie on phys_section[] and phys_map_nodes[]. When we want lockless
>> mmio dispatch, we need something like RCU.
>>
>> Acheive this with PhysPageTable which contains all of the info for all
>> radix trees. After all address space listeners update (ie. excluding the
>> readers) we switch from PhysPageTable *cur_pgtbl to *next_pgtbl.
>> (The real RCU style is adopt by listener, see next patch)
>>
>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>> ---
>>  exec.c                |  197 +++++++++++++++++++++++++------------------------
>>  include/exec/memory.h |    2 +
>>  memory.c              |    2 +
>>  3 files changed, 106 insertions(+), 95 deletions(-)
>>
>> diff --git a/exec.c b/exec.c
>> index c5f8082..bb4e540 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -80,23 +80,34 @@ int use_icount;
>>  #if !defined(CONFIG_USER_ONLY)
>>
>>  #define SUBSECTION_IDX(addr)      ((addr) & ~TARGET_PAGE_MASK)
>> -#define PHYS_SECTION_ID(psection) ((psection) - phys_sections)
>> +#define PHYS_SECTION_ID(psection, base) ((psection) - base->phys_sections)
>>
>>  typedef struct PhysSection {
>>      MemoryRegionSection section;
>>      uint16_t *sub_section;
>>  } PhysSection;
>>
>> -static PhysSection *phys_sections;
>> -static unsigned phys_sections_nb, phys_sections_nb_alloc;
>> -static uint16_t phys_section_unassigned;
>> -static uint16_t phys_section_notdirty;
>> -static uint16_t phys_section_rom;
>> -static uint16_t phys_section_watch;
>> +typedef PhysPageEntry Node[L2_SIZE];
>>
>> -/* Simple allocator for PhysPageEntry nodes */
>> -static PhysPageEntry (*phys_map_nodes)[L2_SIZE];
>> -static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc;
>> +typedef struct PhysPageTable PhysPageTable;
>> +
>> +struct PhysPageTable {
>> +    int ref;
>> +    PhysSection *phys_sections;
>> +    unsigned phys_sections_nb;
>> +    unsigned phys_sections_nb_alloc;
>> +    uint16_t phys_section_unassigned;
>> +    uint16_t phys_section_notdirty;
>> +    uint16_t phys_section_rom;
>> +    uint16_t phys_section_watch;
>
> These four should be constants.  Please make them #defines and add
> assertions that they have the expected values (in a separate patch).
>
Things like the following?
#define  phys_section_unassigned 0
t = dummy_section(&io_mem_unassigned);
assert(t ==phys_section_unassigned );

>> +
>> +    Node *phys_map_nodes;
>> +    unsigned phys_map_nodes_nb;
>> +    unsigned phys_map_nodes_nb_alloc;
>> +};
>> +static PhysPageTable *cur_pgtbl;
>> +static PhysPageTable *next_pgtbl;
>
> You shouldn't need cur_pgtbl.  Instead, each AddressSpaceDispatch should
> have a pointer to its own cur_pgtbl.  In the commit hook you can then
> take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
> the new one.
>
Here is a trick. All AddressSpaceDispatch instances update their radix trees
at the same time, so the reclaimer can run after all of them have committed
and drop the old phys_map_nodes[] — which holds all of the radix trees — at
once. Is that OK? Or would you still prefer that each AddressSpaceDispatch
have its own space for its radix tree?

> The lock must also be taken in address_space_translate, together with a
> relatively expensive ref/unref pair (two atomic operations).  We
> probably need real RCU so that we can skip the ref/unref.
>
Currently, without RCU, do we rely on as->lock, so there is no need to
ref/unref cur_pgtbl? With RCU, I think we still need to ref/unref the mr,
since mr->ops->read/write can block and therefore must be excluded from the
RCU read-side critical section.
>>
>>  #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>>
>> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>>
>>  static void phys_map_node_reserve(unsigned nodes)
>>  {
>> -    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
>> +    if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
>>          typedef PhysPageEntry Node[L2_SIZE];
>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
>> -                                      phys_map_nodes_nb + nodes);
>> -        phys_map_nodes = g_renew(Node, phys_map_nodes,
>> -                                 phys_map_nodes_nb_alloc);
>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
>> +                                      next_pgtbl->phys_map_nodes_nb + nodes);
>> +        next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
>> +                                 next_pgtbl->phys_map_nodes_nb_alloc);
>>      }
>>  }
>>
>> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
>>      unsigned i;
>>      uint16_t ret;
>>
>> -    ret = phys_map_nodes_nb++;
>> +    ret = next_pgtbl->phys_map_nodes_nb++;
>>      assert(ret != PHYS_MAP_NODE_NIL);
>> -    assert(ret != phys_map_nodes_nb_alloc);
>> +    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
>>      for (i = 0; i < L2_SIZE; ++i) {
>> -        phys_map_nodes[ret][i].is_leaf = 0;
>> -        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>> +        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
>> +        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>      }
>>      return ret;
>>  }
>>
>> -static void phys_map_nodes_reset(void)
>> -{
>> -    phys_map_nodes_nb = 0;
>> -}
>> -
>> -
>>  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>                                  hwaddr *nb, uint16_t leaf,
>>                                  int level)
>> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>
>>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>>          lp->ptr = phys_map_node_alloc();
>> -        p = phys_map_nodes[lp->ptr];
>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>          if (level == 0) {
>>              for (i = 0; i < L2_SIZE; i++) {
>>                  p[i].is_leaf = 1;
>> -                p[i].ptr = phys_section_unassigned;
>> +                p[i].ptr = next_pgtbl->phys_section_unassigned;
>>              }
>>          }
>>      } else {
>> -        p = phys_map_nodes[lp->ptr];
>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>      }
>>      lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>>
>> @@ -192,11 +197,13 @@ static PhysSection *phys_section_find(AddressSpaceDispatch *d,
>>  {
>>      PhysPageEntry lp = d->phys_map;
>>      PhysPageEntry *p;
>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>> +    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>>      int i;
>>
>>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
>> -            return &phys_sections[phys_section_unassigned];
>> +            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>>          }
>>          p = phys_map_nodes[lp.ptr];
>>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
>> @@ -209,6 +216,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
>>  {
>>      PhysSection *psection;
>>      uint16_t idx;
>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>
>>      psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
>>      if (psection->sub_section) {
>> @@ -246,7 +254,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
>>                  | (addr & iotlb.addr_mask));
>>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>>          if (!iotlb.perm[is_write]) {
>> -            section = &phys_sections[phys_section_unassigned].section;
>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>>              break;
>>          }
>>
>> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>          iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
>>              + xlat;
>>          if (!section->readonly) {
>> -            iotlb |= phys_section_notdirty;
>> +            iotlb |= cur_pgtbl->phys_section_notdirty;
>>          } else {
>> -            iotlb |= phys_section_rom;
>> +            iotlb |= cur_pgtbl->phys_section_rom;
>>          }
>>      } else {
>> -        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
>> +        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
>>          iotlb += xlat;
>>      }
>>
>> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
>>              /* Avoid trapping reads of pages with a write breakpoint. */
>>              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
>> -                iotlb = phys_section_watch + paddr;
>> +                iotlb = cur_pgtbl->phys_section_watch + paddr;
>>                  *address |= TLB_MMIO;
>>                  break;
>>              }
>> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, uint32_t start,
>>                                 uint32_t end, uint16_t section);
>>  static void subsections_init(PhysSection *psection);
>>
>> -static void destroy_page_desc(uint16_t section_index)
>> +/* Call after all listener has been commit.
>> +  * we do not walk over tree, just simply drop.
>> +  */
>> +static void destroy_pagetable(PhysPageTable *pgtbl)
>>  {
>> -    g_free(phys_sections[section_index].sub_section);
>> -}
>> -
>> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
>> -{
>> -    unsigned i;
>> -    PhysPageEntry *p;
>> +    int i;
>>
>> -    if (lp->ptr == PHYS_MAP_NODE_NIL) {
>> -        return;
>> -    }
>> +    g_free(pgtbl->phys_map_nodes);
>>
>> -    p = phys_map_nodes[lp->ptr];
>> -    for (i = 0; i < L2_SIZE; ++i) {
>> -        if (!p[i].is_leaf) {
>> -            destroy_l2_mapping(&p[i], level - 1);
>> +    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
>> +        if (pgtbl->phys_sections[i].sub_section) {
>> +            g_free(pgtbl->phys_sections[i].sub_section);
>>          } else {
>> -            destroy_page_desc(p[i].ptr);
>> +            memory_region_unref(pgtbl->phys_sections[i].section.mr);
>>          }
>>      }
>> -    lp->is_leaf = 0;
>> -    lp->ptr = PHYS_MAP_NODE_NIL;
>> -}
>> +    g_free(pgtbl->phys_sections);
>>
>> -static void destroy_all_mappings(AddressSpaceDispatch *d)
>> -{
>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>> -    phys_map_nodes_reset();
>> +    g_free(pgtbl);
>>  }
>
> This should be a separate patch.
>
Sorry, this is part of the reclaimer, which performs the destruction of the
previous pgtbl. Why should it be separated out?

>> -static uint16_t phys_section_add(MemoryRegionSection *section)
>> +static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)
>
> phys_section_add will always add to next_pgtbl.
>
Ok, will drop @pgtbl.
>>  {
>> -    assert(phys_sections_nb < TARGET_PAGE_SIZE);
>> +    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>>
>> -    if (phys_sections_nb == phys_sections_nb_alloc) {
>> -        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
>> -        phys_sections = g_renew(PhysSection, phys_sections,
>> -                                phys_sections_nb_alloc);
>> +    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
>> +        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
>> +        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
>> +                                pgtbl->phys_sections_nb_alloc);
>>      }
>> -    phys_sections[phys_sections_nb].section = *section;
>> -    phys_sections[phys_sections_nb].sub_section = NULL;
>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
>>      memory_region_ref(section->mr);
>> -    return phys_sections_nb++;
>> -}
>> -
>> -static void phys_sections_clear(void)
>> -{
>> -    while (phys_sections_nb > 0) {
>> -        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
>> -        memory_region_unref(phys_section->section.mr);
>> -    }
>> +    return pgtbl->phys_sections_nb++;
>>  }
>>
>>  static void register_subsection(AddressSpaceDispatch *d,
>> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch *d,
>>             psection->section.mr == &io_mem_unassigned);
>>
>>      if (!psection->sub_section) {
>> -        new_section = phys_section_add(&subsection);
>> -        psection = &phys_sections[new_section];
>> +        new_section = phys_section_add(&subsection, next_pgtbl);
>> +        psection = &next_pgtbl->phys_sections[new_section];
>>          subsections_init(psection);
>>          phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
>>      } else {
>> -        new_section = PHYS_SECTION_ID(psection);
>> +        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
>>      }
>>
>> -    new_subsection = phys_section_add(section);
>> +    new_subsection = phys_section_add(section, next_pgtbl);
>>
>>      /* phys_section_add invalidates psection, reload it  */
>> -    psection = &phys_sections[new_section];
>> +    psection = &next_pgtbl->phys_sections[new_section];
>>      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
>>      end = start + section->size - 1;
>>      subsection_register(psection, start, end, new_subsection);
>> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
>>      hwaddr start_addr = section->offset_within_address_space;
>>      ram_addr_t size = section->size;
>>      hwaddr addr;
>> -    uint16_t section_index = phys_section_add(section);
>> +    uint16_t section_index = phys_section_add(section, next_pgtbl);
>>
>>      assert(size);
>>
>> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
>>  {
>>      psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
>>      subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
>> -                        phys_section_unassigned);
>> +                        next_pgtbl->phys_section_unassigned);
>>  }
>>
>>  static uint16_t dummy_section(MemoryRegion *mr)
>> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
>>          .size = UINT64_MAX,
>>      };
>>
>> -    return phys_section_add(&section);
>> +    return phys_section_add(&section, next_pgtbl);
>>  }
>>
>>  MemoryRegion *iotlb_to_region(hwaddr index)
>>  {
>> -    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>> +    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>  }
>>
>>  static void io_mem_init(void)
>> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
>>                            "watch", UINT64_MAX);
>>  }
>>
>> +void global_begin(void)
>> +{
>> +    next_pgtbl = g_new0(PhysPageTable, 1);
>> +    next_pgtbl->ref = 1;
>> +    next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
>> +    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
>> +    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
>> +    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
>> +}
>> +
>> +/* other listeners finished */
>> +void global_commit(void)
>> +{
>> +    PhysPageTable *t = cur_pgtbl;
>> +
>> +    cur_pgtbl = next_pgtbl;
>> +    /* Fix me,  currently, we rely on each address space listener agaist its
>> +      * reader. So when we come here, no readers will touch old phys_map_node.
>> +      * After rcu, should changed to call_rcu()
>> +      */
>> +    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
>> +        destroy_pagetable(t);
>> +    }
>
> global_commit should not be needed, and global_begin should be simply
> the new core_begin.  It should unref next_pgtbl and reallocate a new one.
>
Good idea, thanks.

Regards,
Pingfan
>
>> +}
>> +
>>  static void mem_begin(MemoryListener *listener)
>>  {
>>      AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
>>
>> -    destroy_all_mappings(d);
>>      d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>>  }
>>
>>  static void core_begin(MemoryListener *listener)
>>  {
>> -    phys_sections_clear();
>> -    phys_section_unassigned = dummy_section(&io_mem_unassigned);
>> -    phys_section_notdirty = dummy_section(&io_mem_notdirty);
>> -    phys_section_rom = dummy_section(&io_mem_rom);
>> -    phys_section_watch = dummy_section(&io_mem_watch);
>>  }
>>
>>  static void tcg_commit(MemoryListener *listener)
>> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>      AddressSpaceDispatch *d = as->dispatch;
>>
>>      memory_listener_unregister(&d->listener);
>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>      g_free(d);
>>      as->dispatch = NULL;
>>  }
>> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>>
>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>          if (memory_region_is_ram(section->mr)) {
>> -            section = &phys_sections[phys_section_rom].section;
>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>          }
>>          io_mem_write(section->mr, addr, val, 4);
>>      } else {
>> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>>
>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>          if (memory_region_is_ram(section->mr)) {
>> -            section = &phys_sections[phys_section_rom].section;
>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>          }
>>  #ifdef TARGET_WORDS_BIGENDIAN
>>          io_mem_write(section->mr, addr, val >> 32, 4);
>> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, uint32_t val,
>>
>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>          if (memory_region_is_ram(section->mr)) {
>> -            section = &phys_sections[phys_section_rom].section;
>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>          }
>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, uint32_t val,
>>
>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>          if (memory_region_is_ram(section->mr)) {
>> -            section = &phys_sections[phys_section_rom].section;
>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>          }
>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>> index b97ace7..cc654fa 100644
>> --- a/include/exec/memory.h
>> +++ b/include/exec/memory.h
>> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
>>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>>                           int is_write, hwaddr access_len);
>>
>> +void global_begin(void);
>> +void global_commit(void);
>>
>>  #endif
>>
>> diff --git a/memory.c b/memory.c
>> index 1a86607..da06dfd 100644
>> --- a/memory.c
>> +++ b/memory.c
>> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
>>      --memory_region_transaction_depth;
>>      if (!memory_region_transaction_depth && memory_region_update_pending) {
>>          memory_region_update_pending = false;
>> +        global_begin();
>>          MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>>
>>          QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
>> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
>>          }
>>
>>          MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
>> +        global_commit();
>>      }
>>  }
>>
>>
>
Paolo Bonzini - May 14, 2013, 9:27 a.m.
Il 14/05/2013 05:38, liu ping fan ha scritto:
>>> +struct PhysPageTable {
>>> +    int ref;
>>> +    PhysSection *phys_sections;
>>> +    unsigned phys_sections_nb;
>>> +    unsigned phys_sections_nb_alloc;
>>> +    uint16_t phys_section_unassigned;
>>> +    uint16_t phys_section_notdirty;
>>> +    uint16_t phys_section_rom;
>>> +    uint16_t phys_section_watch;
>>
>> These four should be constants.  Please make them #defines and add
>> assertions that they have the expected values (in a separate patch).
>>
> Things like the following?
> #define  phys_section_unassigned 0
> t = dummy_section(&io_mem_unassigned);
> assert(t ==phys_section_unassigned );

Yes.

>>> +
>>> +    Node *phys_map_nodes;
>>> +    unsigned phys_map_nodes_nb;
>>> +    unsigned phys_map_nodes_nb_alloc;
>>> +};
>>> +static PhysPageTable *cur_pgtbl;
>>> +static PhysPageTable *next_pgtbl;
>>
>> You shouldn't need cur_pgtbl.  Instead, each AddressSpaceDispatch should
>> have a pointer to its own cur_pgtbl.  In the commit hook you can then
>> take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
>> the new one.
>>
> Here is a trick. All AddressSpaceDispatch will update its radix tree
> at the same time, so the re-claimer will come after all of them commit
> to drop the old phys_map_node[] which holds the radix trees all at
> once. Is it OK? Or you still prefer  each AddressSpaceDispatch has its
> own space for radix tree?

I'm not sure... I prefer to have more stuff in AddressSpaceDispatch to
make things as streamlined as possible on the read side.

>> The lock must also be taken in address_space_translate, together with a
>> relatively expensive ref/unref pair (two atomic operations).  We
>> probably need real RCU so that we can skip the ref/unref.
>>
> Currently, without RCU, do we rely on as->lock? So no need ref/unref
> on cur_pgtbl?  With RCU, I think we need ref/unref on mr, since
> mr->ops->read/write can block, and should be excluded from RCU
> section.

Yes, we need ref/unref on the region, but in many cases that will be a
no-op (most important, for access to system RAM).  And if we can do the
dispatch in the RCU section, similar to what I proposed in reviewing
your patch 2, we have no ref/unref in the common case of accessing RAM.

>>>  #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>>>
>>> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>>>
>>>  static void phys_map_node_reserve(unsigned nodes)
>>>  {
>>> -    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
>>> +    if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
>>>          typedef PhysPageEntry Node[L2_SIZE];
>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
>>> -                                      phys_map_nodes_nb + nodes);
>>> -        phys_map_nodes = g_renew(Node, phys_map_nodes,
>>> -                                 phys_map_nodes_nb_alloc);
>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
>>> +                                      next_pgtbl->phys_map_nodes_nb + nodes);
>>> +        next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
>>> +                                 next_pgtbl->phys_map_nodes_nb_alloc);
>>>      }
>>>  }
>>>
>>> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
>>>      unsigned i;
>>>      uint16_t ret;
>>>
>>> -    ret = phys_map_nodes_nb++;
>>> +    ret = next_pgtbl->phys_map_nodes_nb++;
>>>      assert(ret != PHYS_MAP_NODE_NIL);
>>> -    assert(ret != phys_map_nodes_nb_alloc);
>>> +    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
>>>      for (i = 0; i < L2_SIZE; ++i) {
>>> -        phys_map_nodes[ret][i].is_leaf = 0;
>>> -        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>> +        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
>>> +        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>      }
>>>      return ret;
>>>  }
>>>
>>> -static void phys_map_nodes_reset(void)
>>> -{
>>> -    phys_map_nodes_nb = 0;
>>> -}
>>> -
>>> -
>>>  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>                                  hwaddr *nb, uint16_t leaf,
>>>                                  int level)
>>> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>
>>>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>>>          lp->ptr = phys_map_node_alloc();
>>> -        p = phys_map_nodes[lp->ptr];
>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>          if (level == 0) {
>>>              for (i = 0; i < L2_SIZE; i++) {
>>>                  p[i].is_leaf = 1;
>>> -                p[i].ptr = phys_section_unassigned;
>>> +                p[i].ptr = next_pgtbl->phys_section_unassigned;
>>>              }
>>>          }
>>>      } else {
>>> -        p = phys_map_nodes[lp->ptr];
>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>      }
>>>      lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>>>
>>> @@ -192,11 +197,13 @@ static PhysSection *phys_section_find(AddressSpaceDispatch *d,
>>>  {
>>>      PhysPageEntry lp = d->phys_map;
>>>      PhysPageEntry *p;
>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>> +    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>>>      int i;
>>>
>>>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>>>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
>>> -            return &phys_sections[phys_section_unassigned];
>>> +            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>>>          }
>>>          p = phys_map_nodes[lp.ptr];
>>>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
>>> @@ -209,6 +216,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
>>>  {
>>>      PhysSection *psection;
>>>      uint16_t idx;
>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>
>>>      psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
>>>      if (psection->sub_section) {
>>> @@ -246,7 +254,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
>>>                  | (addr & iotlb.addr_mask));
>>>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>>>          if (!iotlb.perm[is_write]) {
>>> -            section = &phys_sections[phys_section_unassigned].section;
>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>>>              break;
>>>          }
>>>
>>> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>          iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
>>>              + xlat;
>>>          if (!section->readonly) {
>>> -            iotlb |= phys_section_notdirty;
>>> +            iotlb |= cur_pgtbl->phys_section_notdirty;
>>>          } else {
>>> -            iotlb |= phys_section_rom;
>>> +            iotlb |= cur_pgtbl->phys_section_rom;
>>>          }
>>>      } else {
>>> -        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
>>> +        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
>>>          iotlb += xlat;
>>>      }
>>>
>>> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
>>>              /* Avoid trapping reads of pages with a write breakpoint. */
>>>              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
>>> -                iotlb = phys_section_watch + paddr;
>>> +                iotlb = cur_pgtbl->phys_section_watch + paddr;
>>>                  *address |= TLB_MMIO;
>>>                  break;
>>>              }
>>> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, uint32_t start,
>>>                                 uint32_t end, uint16_t section);
>>>  static void subsections_init(PhysSection *psection);
>>>
>>> -static void destroy_page_desc(uint16_t section_index)
>>> +/* Call after all listener has been commit.
>>> +  * we do not walk over tree, just simply drop.
>>> +  */
>>> +static void destroy_pagetable(PhysPageTable *pgtbl)
>>>  {
>>> -    g_free(phys_sections[section_index].sub_section);
>>> -}
>>> -
>>> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
>>> -{
>>> -    unsigned i;
>>> -    PhysPageEntry *p;
>>> +    int i;
>>>
>>> -    if (lp->ptr == PHYS_MAP_NODE_NIL) {
>>> -        return;
>>> -    }
>>> +    g_free(pgtbl->phys_map_nodes);
>>>
>>> -    p = phys_map_nodes[lp->ptr];
>>> -    for (i = 0; i < L2_SIZE; ++i) {
>>> -        if (!p[i].is_leaf) {
>>> -            destroy_l2_mapping(&p[i], level - 1);
>>> +    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
>>> +        if (pgtbl->phys_sections[i].sub_section) {
>>> +            g_free(pgtbl->phys_sections[i].sub_section);
>>>          } else {
>>> -            destroy_page_desc(p[i].ptr);
>>> +            memory_region_unref(pgtbl->phys_sections[i].section.mr);
>>>          }
>>>      }
>>> -    lp->is_leaf = 0;
>>> -    lp->ptr = PHYS_MAP_NODE_NIL;
>>> -}
>>> +    g_free(pgtbl->phys_sections);
>>>
>>> -static void destroy_all_mappings(AddressSpaceDispatch *d)
>>> -{
>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>> -    phys_map_nodes_reset();
>>> +    g_free(pgtbl);
>>>  }
>>
>> This should be a separate patch.
>>
> Sorry, this is part of reclaimer to play destruction of prev pgtbl.
> Why to separate it?

You are changing the algorithm by which the old pagetable is destroyed.
 This applies just as well to the current code.

BTW, your patch includes Jan's subpage patch which was broken.  I'll
drop it from the branch next time I push.

Paolo

>>> -static uint16_t phys_section_add(MemoryRegionSection *section)
>>> +static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)
>>
>> phys_section_add will always add to next_pgtbl.
>>
> Ok, will drop @pgtbl.
>>>  {
>>> -    assert(phys_sections_nb < TARGET_PAGE_SIZE);
>>> +    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>>>
>>> -    if (phys_sections_nb == phys_sections_nb_alloc) {
>>> -        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
>>> -        phys_sections = g_renew(PhysSection, phys_sections,
>>> -                                phys_sections_nb_alloc);
>>> +    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
>>> +        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
>>> +        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
>>> +                                pgtbl->phys_sections_nb_alloc);
>>>      }
>>> -    phys_sections[phys_sections_nb].section = *section;
>>> -    phys_sections[phys_sections_nb].sub_section = NULL;
>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
>>>      memory_region_ref(section->mr);
>>> -    return phys_sections_nb++;
>>> -}
>>> -
>>> -static void phys_sections_clear(void)
>>> -{
>>> -    while (phys_sections_nb > 0) {
>>> -        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
>>> -        memory_region_unref(phys_section->section.mr);
>>> -    }
>>> +    return pgtbl->phys_sections_nb++;
>>>  }
>>>
>>>  static void register_subsection(AddressSpaceDispatch *d,
>>> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch *d,
>>>             psection->section.mr == &io_mem_unassigned);
>>>
>>>      if (!psection->sub_section) {
>>> -        new_section = phys_section_add(&subsection);
>>> -        psection = &phys_sections[new_section];
>>> +        new_section = phys_section_add(&subsection, next_pgtbl);
>>> +        psection = &next_pgtbl->phys_sections[new_section];
>>>          subsections_init(psection);
>>>          phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
>>>      } else {
>>> -        new_section = PHYS_SECTION_ID(psection);
>>> +        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
>>>      }
>>>
>>> -    new_subsection = phys_section_add(section);
>>> +    new_subsection = phys_section_add(section, next_pgtbl);
>>>
>>>      /* phys_section_add invalidates psection, reload it  */
>>> -    psection = &phys_sections[new_section];
>>> +    psection = &next_pgtbl->phys_sections[new_section];
>>>      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
>>>      end = start + section->size - 1;
>>>      subsection_register(psection, start, end, new_subsection);
>>> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
>>>      hwaddr start_addr = section->offset_within_address_space;
>>>      ram_addr_t size = section->size;
>>>      hwaddr addr;
>>> -    uint16_t section_index = phys_section_add(section);
>>> +    uint16_t section_index = phys_section_add(section, next_pgtbl);
>>>
>>>      assert(size);
>>>
>>> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
>>>  {
>>>      psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
>>>      subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
>>> -                        phys_section_unassigned);
>>> +                        next_pgtbl->phys_section_unassigned);
>>>  }
>>>
>>>  static uint16_t dummy_section(MemoryRegion *mr)
>>> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
>>>          .size = UINT64_MAX,
>>>      };
>>>
>>> -    return phys_section_add(&section);
>>> +    return phys_section_add(&section, next_pgtbl);
>>>  }
>>>
>>>  MemoryRegion *iotlb_to_region(hwaddr index)
>>>  {
>>> -    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>> +    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>  }
>>>
>>>  static void io_mem_init(void)
>>> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
>>>                            "watch", UINT64_MAX);
>>>  }
>>>
>>> +void global_begin(void)
>>> +{
>>> +    next_pgtbl = g_new0(PhysPageTable, 1);
>>> +    next_pgtbl->ref = 1;
>>> +    next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>> +    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>> +    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
>>> +    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
>>> +}
>>> +
>>> +/* other listeners finished */
>>> +void global_commit(void)
>>> +{
>>> +    PhysPageTable *t = cur_pgtbl;
>>> +
>>> +    cur_pgtbl = next_pgtbl;
>>> +    /* Fix me,  currently, we rely on each address space listener agaist its
>>> +      * reader. So when we come here, no readers will touch old phys_map_node.
>>> +      * After rcu, should changed to call_rcu()
>>> +      */
>>> +    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
>>> +        destroy_pagetable(t);
>>> +    }
>>
>> global_commit should not be needed, and global_begin should be simply
>> the new core_begin.  It should unref next_pgtbl and reallocate a new one.
>>
> Good idea, thanks.
> 
> Regards,
> Pingfan
>>
>>> +}
>>> +
>>>  static void mem_begin(MemoryListener *listener)
>>>  {
>>>      AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
>>>
>>> -    destroy_all_mappings(d);
>>>      d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>>>  }
>>>
>>>  static void core_begin(MemoryListener *listener)
>>>  {
>>> -    phys_sections_clear();
>>> -    phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>> -    phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>> -    phys_section_rom = dummy_section(&io_mem_rom);
>>> -    phys_section_watch = dummy_section(&io_mem_watch);
>>>  }
>>>
>>>  static void tcg_commit(MemoryListener *listener)
>>> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>>      AddressSpaceDispatch *d = as->dispatch;
>>>
>>>      memory_listener_unregister(&d->listener);
>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>      g_free(d);
>>>      as->dispatch = NULL;
>>>  }
>>> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>>>
>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>          if (memory_region_is_ram(section->mr)) {
>>> -            section = &phys_sections[phys_section_rom].section;
>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>          }
>>>          io_mem_write(section->mr, addr, val, 4);
>>>      } else {
>>> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>>>
>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>          if (memory_region_is_ram(section->mr)) {
>>> -            section = &phys_sections[phys_section_rom].section;
>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>          }
>>>  #ifdef TARGET_WORDS_BIGENDIAN
>>>          io_mem_write(section->mr, addr, val >> 32, 4);
>>> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, uint32_t val,
>>>
>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>          if (memory_region_is_ram(section->mr)) {
>>> -            section = &phys_sections[phys_section_rom].section;
>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>          }
>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, uint32_t val,
>>>
>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>          if (memory_region_is_ram(section->mr)) {
>>> -            section = &phys_sections[phys_section_rom].section;
>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>          }
>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>> index b97ace7..cc654fa 100644
>>> --- a/include/exec/memory.h
>>> +++ b/include/exec/memory.h
>>> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
>>>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>>>                           int is_write, hwaddr access_len);
>>>
>>> +void global_begin(void);
>>> +void global_commit(void);
>>>
>>>  #endif
>>>
>>> diff --git a/memory.c b/memory.c
>>> index 1a86607..da06dfd 100644
>>> --- a/memory.c
>>> +++ b/memory.c
>>> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
>>>      --memory_region_transaction_depth;
>>>      if (!memory_region_transaction_depth && memory_region_update_pending) {
>>>          memory_region_update_pending = false;
>>> +        global_begin();
>>>          MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>>>
>>>          QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
>>> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
>>>          }
>>>
>>>          MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
>>> +        global_commit();
>>>      }
>>>  }
>>>
>>>
>>
> 
>
pingfan liu - May 15, 2013, 7:04 a.m.
On Tue, May 14, 2013 at 5:27 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 14/05/2013 05:38, liu ping fan ha scritto:
>>>> +struct PhysPageTable {
>>>> +    int ref;
>>>> +    PhysSection *phys_sections;
>>>> +    unsigned phys_sections_nb;
>>>> +    unsigned phys_sections_nb_alloc;
>>>> +    uint16_t phys_section_unassigned;
>>>> +    uint16_t phys_section_notdirty;
>>>> +    uint16_t phys_section_rom;
>>>> +    uint16_t phys_section_watch;
>>>
>>> These four should be constants.  Please make them #defines and add
>>> assertions that they have the expected values (in a separate patch).
>>>
>> Things like the following?
>> #define  phys_section_unassigned 0
>> t = dummy_section(&io_mem_unassigned);
>> assert(t ==phys_section_unassigned );
>
> Yes.
>
>>>> +
>>>> +    Node *phys_map_nodes;
>>>> +    unsigned phys_map_nodes_nb;
>>>> +    unsigned phys_map_nodes_nb_alloc;
>>>> +};
>>>> +static PhysPageTable *cur_pgtbl;
>>>> +static PhysPageTable *next_pgtbl;
>>>
>>> You shouldn't need cur_pgtbl.  Instead, each AddressSpaceDispatch should
>>> have a pointer to its own cur_pgtbl.  In the commit hook you can then
>>> take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
>>> the new one.
>>>
>> Here is a trick. All AddressSpaceDispatch will update its radix tree
>> at the same time, so the re-claimer will come after all of them commit
>> to drop the old phys_map_node[] which holds the radix trees all at
>> once. Is it OK? Or you still prefer  each AddressSpaceDispatch has its
>> own space for radix tree?
>
> I'm not sure... I prefer to have more stuff in AddressSpaceDispatch to
> make things as streamlined as possible on the read side.
>
Ok, I will implement it based on each AddressSpaceDispatch.

>>> The lock must also be taken in address_space_translate, together with a
>>> relatively expensive ref/unref pair (two atomic operations).  We
>>> probably need real RCU so that we can skip the ref/unref.
>>>
>> Currently, without RCU, do we rely on as->lock? So no need ref/unref
>> on cur_pgtbl?  With RCU, I think we need ref/unref on mr, since
>> mr->ops->read/write can block, and should be excluded from RCU
>> section.
>
> Yes, we need ref/unref on the region, but in many cases that will be a
> no-op (most important, for access to system RAM).  And if we can do the
> dispatch in the RCU section, similar to what I proposed in reviewing
> your patch 2, we have no ref/unref in the common case of accessing RAM.
>
>>>>  #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>>>>
>>>> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>>>>
>>>>  static void phys_map_node_reserve(unsigned nodes)
>>>>  {
>>>> -    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
>>>> +    if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
>>>>          typedef PhysPageEntry Node[L2_SIZE];
>>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
>>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
>>>> -                                      phys_map_nodes_nb + nodes);
>>>> -        phys_map_nodes = g_renew(Node, phys_map_nodes,
>>>> -                                 phys_map_nodes_nb_alloc);
>>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
>>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
>>>> +                                      next_pgtbl->phys_map_nodes_nb + nodes);
>>>> +        next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
>>>> +                                 next_pgtbl->phys_map_nodes_nb_alloc);
>>>>      }
>>>>  }
>>>>
>>>> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
>>>>      unsigned i;
>>>>      uint16_t ret;
>>>>
>>>> -    ret = phys_map_nodes_nb++;
>>>> +    ret = next_pgtbl->phys_map_nodes_nb++;
>>>>      assert(ret != PHYS_MAP_NODE_NIL);
>>>> -    assert(ret != phys_map_nodes_nb_alloc);
>>>> +    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
>>>>      for (i = 0; i < L2_SIZE; ++i) {
>>>> -        phys_map_nodes[ret][i].is_leaf = 0;
>>>> -        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>> +        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
>>>> +        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>>      }
>>>>      return ret;
>>>>  }
>>>>
>>>> -static void phys_map_nodes_reset(void)
>>>> -{
>>>> -    phys_map_nodes_nb = 0;
>>>> -}
>>>> -
>>>> -
>>>>  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>>                                  hwaddr *nb, uint16_t leaf,
>>>>                                  int level)
>>>> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>>
>>>>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>>>>          lp->ptr = phys_map_node_alloc();
>>>> -        p = phys_map_nodes[lp->ptr];
>>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>>          if (level == 0) {
>>>>              for (i = 0; i < L2_SIZE; i++) {
>>>>                  p[i].is_leaf = 1;
>>>> -                p[i].ptr = phys_section_unassigned;
>>>> +                p[i].ptr = next_pgtbl->phys_section_unassigned;
>>>>              }
>>>>          }
>>>>      } else {
>>>> -        p = phys_map_nodes[lp->ptr];
>>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>>      }
>>>>      lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>>>>
>>>> @@ -192,11 +197,13 @@ static PhysSection *phys_section_find(AddressSpaceDispatch *d,
>>>>  {
>>>>      PhysPageEntry lp = d->phys_map;
>>>>      PhysPageEntry *p;
>>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>> +    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>>>>      int i;
>>>>
>>>>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>>>>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
>>>> -            return &phys_sections[phys_section_unassigned];
>>>> +            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>>>>          }
>>>>          p = phys_map_nodes[lp.ptr];
>>>>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
>>>> @@ -209,6 +216,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
>>>>  {
>>>>      PhysSection *psection;
>>>>      uint16_t idx;
>>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>>
>>>>      psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
>>>>      if (psection->sub_section) {
>>>> @@ -246,7 +254,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
>>>>                  | (addr & iotlb.addr_mask));
>>>>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>>>>          if (!iotlb.perm[is_write]) {
>>>> -            section = &phys_sections[phys_section_unassigned].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>>>>              break;
>>>>          }
>>>>
>>>> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>>          iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
>>>>              + xlat;
>>>>          if (!section->readonly) {
>>>> -            iotlb |= phys_section_notdirty;
>>>> +            iotlb |= cur_pgtbl->phys_section_notdirty;
>>>>          } else {
>>>> -            iotlb |= phys_section_rom;
>>>> +            iotlb |= cur_pgtbl->phys_section_rom;
>>>>          }
>>>>      } else {
>>>> -        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
>>>> +        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
>>>>          iotlb += xlat;
>>>>      }
>>>>
>>>> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>>          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
>>>>              /* Avoid trapping reads of pages with a write breakpoint. */
>>>>              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
>>>> -                iotlb = phys_section_watch + paddr;
>>>> +                iotlb = cur_pgtbl->phys_section_watch + paddr;
>>>>                  *address |= TLB_MMIO;
>>>>                  break;
>>>>              }
>>>> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, uint32_t start,
>>>>                                 uint32_t end, uint16_t section);
>>>>  static void subsections_init(PhysSection *psection);
>>>>
>>>> -static void destroy_page_desc(uint16_t section_index)
>>>> +/* Call after all listener has been commit.
>>>> +  * we do not walk over tree, just simply drop.
>>>> +  */
>>>> +static void destroy_pagetable(PhysPageTable *pgtbl)
>>>>  {
>>>> -    g_free(phys_sections[section_index].sub_section);
>>>> -}
>>>> -
>>>> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
>>>> -{
>>>> -    unsigned i;
>>>> -    PhysPageEntry *p;
>>>> +    int i;
>>>>
>>>> -    if (lp->ptr == PHYS_MAP_NODE_NIL) {
>>>> -        return;
>>>> -    }
>>>> +    g_free(pgtbl->phys_map_nodes);
>>>>
>>>> -    p = phys_map_nodes[lp->ptr];
>>>> -    for (i = 0; i < L2_SIZE; ++i) {
>>>> -        if (!p[i].is_leaf) {
>>>> -            destroy_l2_mapping(&p[i], level - 1);
>>>> +    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
>>>> +        if (pgtbl->phys_sections[i].sub_section) {
>>>> +            g_free(pgtbl->phys_sections[i].sub_section);
>>>>          } else {
>>>> -            destroy_page_desc(p[i].ptr);
>>>> +            memory_region_unref(pgtbl->phys_sections[i].section.mr);
>>>>          }
>>>>      }
>>>> -    lp->is_leaf = 0;
>>>> -    lp->ptr = PHYS_MAP_NODE_NIL;
>>>> -}
>>>> +    g_free(pgtbl->phys_sections);
>>>>
>>>> -static void destroy_all_mappings(AddressSpaceDispatch *d)
>>>> -{
>>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>> -    phys_map_nodes_reset();
>>>> +    g_free(pgtbl);
>>>>  }
>>>
>>> This should be a separate patch.
>>>
>> Sorry, this is part of reclaimer to play destruction of prev pgtbl.
>> Why to separate it?
>
> You are changing the algorithm by which the old pagetable is destroyed.
>  This applies just as well to the current code.
>
> BTW, your patch includes Jan's subpage patch which was broken.  I'll
> drop it from the branch next time I push.
>
I will update based on your next version.

Regards,
Pingfan

> Paolo
>
>>>> -static uint16_t phys_section_add(MemoryRegionSection *section)
>>>> +static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)
>>>
>>> phys_section_add will always add to next_pgtbl.
>>>
>> Ok, will drop @pgtbl.
>>>>  {
>>>> -    assert(phys_sections_nb < TARGET_PAGE_SIZE);
>>>> +    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>>>>
>>>> -    if (phys_sections_nb == phys_sections_nb_alloc) {
>>>> -        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
>>>> -        phys_sections = g_renew(PhysSection, phys_sections,
>>>> -                                phys_sections_nb_alloc);
>>>> +    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
>>>> +        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
>>>> +        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
>>>> +                                pgtbl->phys_sections_nb_alloc);
>>>>      }
>>>> -    phys_sections[phys_sections_nb].section = *section;
>>>> -    phys_sections[phys_sections_nb].sub_section = NULL;
>>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
>>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
>>>>      memory_region_ref(section->mr);
>>>> -    return phys_sections_nb++;
>>>> -}
>>>> -
>>>> -static void phys_sections_clear(void)
>>>> -{
>>>> -    while (phys_sections_nb > 0) {
>>>> -        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
>>>> -        memory_region_unref(phys_section->section.mr);
>>>> -    }
>>>> +    return pgtbl->phys_sections_nb++;
>>>>  }
>>>>
>>>>  static void register_subsection(AddressSpaceDispatch *d,
>>>> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch *d,
>>>>             psection->section.mr == &io_mem_unassigned);
>>>>
>>>>      if (!psection->sub_section) {
>>>> -        new_section = phys_section_add(&subsection);
>>>> -        psection = &phys_sections[new_section];
>>>> +        new_section = phys_section_add(&subsection, next_pgtbl);
>>>> +        psection = &next_pgtbl->phys_sections[new_section];
>>>>          subsections_init(psection);
>>>>          phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
>>>>      } else {
>>>> -        new_section = PHYS_SECTION_ID(psection);
>>>> +        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
>>>>      }
>>>>
>>>> -    new_subsection = phys_section_add(section);
>>>> +    new_subsection = phys_section_add(section, next_pgtbl);
>>>>
>>>>      /* phys_section_add invalidates psection, reload it  */
>>>> -    psection = &phys_sections[new_section];
>>>> +    psection = &next_pgtbl->phys_sections[new_section];
>>>>      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
>>>>      end = start + section->size - 1;
>>>>      subsection_register(psection, start, end, new_subsection);
>>>> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
>>>>      hwaddr start_addr = section->offset_within_address_space;
>>>>      ram_addr_t size = section->size;
>>>>      hwaddr addr;
>>>> -    uint16_t section_index = phys_section_add(section);
>>>> +    uint16_t section_index = phys_section_add(section, next_pgtbl);
>>>>
>>>>      assert(size);
>>>>
>>>> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
>>>>  {
>>>>      psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
>>>>      subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
>>>> -                        phys_section_unassigned);
>>>> +                        next_pgtbl->phys_section_unassigned);
>>>>  }
>>>>
>>>>  static uint16_t dummy_section(MemoryRegion *mr)
>>>> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
>>>>          .size = UINT64_MAX,
>>>>      };
>>>>
>>>> -    return phys_section_add(&section);
>>>> +    return phys_section_add(&section, next_pgtbl);
>>>>  }
>>>>
>>>>  MemoryRegion *iotlb_to_region(hwaddr index)
>>>>  {
>>>> -    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>> +    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>>  }
>>>>
>>>>  static void io_mem_init(void)
>>>> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
>>>>                            "watch", UINT64_MAX);
>>>>  }
>>>>
>>>> +void global_begin(void)
>>>> +{
>>>> +    next_pgtbl = g_new0(PhysPageTable, 1);
>>>> +    next_pgtbl->ref = 1;
>>>> +    next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>>> +    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>>> +    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
>>>> +    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
>>>> +}
>>>> +
>>>> +/* other listeners finished */
>>>> +void global_commit(void)
>>>> +{
>>>> +    PhysPageTable *t = cur_pgtbl;
>>>> +
>>>> +    cur_pgtbl = next_pgtbl;
>>>> +    /* Fix me,  currently, we rely on each address space listener agaist its
>>>> +      * reader. So when we come here, no readers will touch old phys_map_node.
>>>> +      * After rcu, should changed to call_rcu()
>>>> +      */
>>>> +    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
>>>> +        destroy_pagetable(t);
>>>> +    }
>>>
>>> global_commit should not be needed, and global_begin should be simply
>>> the new core_begin.  It should unref next_pgtbl and reallocate a new one.
>>>
>> Good idea, thanks.
>>
>> Regards,
>> Pingfan
>>>
>>>> +}
>>>> +
>>>>  static void mem_begin(MemoryListener *listener)
>>>>  {
>>>>      AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
>>>>
>>>> -    destroy_all_mappings(d);
>>>>      d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>>>>  }
>>>>
>>>>  static void core_begin(MemoryListener *listener)
>>>>  {
>>>> -    phys_sections_clear();
>>>> -    phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>>> -    phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>>> -    phys_section_rom = dummy_section(&io_mem_rom);
>>>> -    phys_section_watch = dummy_section(&io_mem_watch);
>>>>  }
>>>>
>>>>  static void tcg_commit(MemoryListener *listener)
>>>> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>>>      AddressSpaceDispatch *d = as->dispatch;
>>>>
>>>>      memory_listener_unregister(&d->listener);
>>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>>      g_free(d);
>>>>      as->dispatch = NULL;
>>>>  }
>>>> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>>>>
>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>          if (memory_region_is_ram(section->mr)) {
>>>> -            section = &phys_sections[phys_section_rom].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>          }
>>>>          io_mem_write(section->mr, addr, val, 4);
>>>>      } else {
>>>> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>>>>
>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>          if (memory_region_is_ram(section->mr)) {
>>>> -            section = &phys_sections[phys_section_rom].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>          }
>>>>  #ifdef TARGET_WORDS_BIGENDIAN
>>>>          io_mem_write(section->mr, addr, val >> 32, 4);
>>>> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, uint32_t val,
>>>>
>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>          if (memory_region_is_ram(section->mr)) {
>>>> -            section = &phys_sections[phys_section_rom].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>          }
>>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>>> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, uint32_t val,
>>>>
>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>          if (memory_region_is_ram(section->mr)) {
>>>> -            section = &phys_sections[phys_section_rom].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>          }
>>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>>> index b97ace7..cc654fa 100644
>>>> --- a/include/exec/memory.h
>>>> +++ b/include/exec/memory.h
>>>> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
>>>>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>>>>                           int is_write, hwaddr access_len);
>>>>
>>>> +void global_begin(void);
>>>> +void global_commit(void);
>>>>
>>>>  #endif
>>>>
>>>> diff --git a/memory.c b/memory.c
>>>> index 1a86607..da06dfd 100644
>>>> --- a/memory.c
>>>> +++ b/memory.c
>>>> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
>>>>      --memory_region_transaction_depth;
>>>>      if (!memory_region_transaction_depth && memory_region_update_pending) {
>>>>          memory_region_update_pending = false;
>>>> +        global_begin();
>>>>          MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>>>>
>>>>          QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
>>>> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
>>>>          }
>>>>
>>>>          MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
>>>> +        global_commit();
>>>>      }
>>>>  }
>>>>
>>>>
>>>
>>
>>
>
pingfan liu - May 26, 2013, 1:02 p.m.
[...]
>>>> +static PhysPageTable *cur_pgtbl;
>>>> +static PhysPageTable *next_pgtbl;
>>>
>>> You shouldn't need cur_pgtbl.  Instead, each AddressSpaceDispatch should
>>> have a pointer to its own cur_pgtbl.  In the commit hook you can then
>>> take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
>>> the new one.
>>>
>> Here is a trick. All AddressSpaceDispatch will update its radix tree
>> at the same time, so the re-claimer will come after all of them commit
>> to drop the old phys_map_node[] which holds the radix trees all at
>> once. Is it OK? Or you still prefer  each AddressSpaceDispatch has its
>> own space for radix tree?
>
> I'm not sure... I prefer to have more stuff in AddressSpaceDispatch to
> make things as streamlined as possible on the read side.
>
When I tried to make each AddressSpaceDispatch own a separate radix
tree, I found that in subpage_read/write() we need to retrieve the
"phys_sections" belonging to this subpage, i.e.,
   section = &phys_sections[mmio->sub_section[idx]]; will be changed to
   section = &subpage->as_dispatch->phys_sections[mmio->sub_section[idx]];
This extra indirection will sacrifice performance.  Is that acceptable?

Regards,
Pingfan

>>> The lock must also be taken in address_space_translate, together with a
>>> relatively expensive ref/unref pair (two atomic operations).  We
>>> probably need real RCU so that we can skip the ref/unref.
>>>
>> Currently, without RCU, do we rely on as->lock? So no need ref/unref
>> on cur_pgtbl?  With RCU, I think we need ref/unref on mr, since
>> mr->ops->read/write can block, and should be excluded from RCU
>> section.
>
> Yes, we need ref/unref on the region, but in many cases that will be a
> no-op (most important, for access to system RAM).  And if we can do the
> dispatch in the RCU section, similar to what I proposed in reviewing
> your patch 2, we have no ref/unref in the common case of accessing RAM.
>
>>>>  #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>>>>
>>>> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>>>>
>>>>  static void phys_map_node_reserve(unsigned nodes)
>>>>  {
>>>> -    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
>>>> +    if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
>>>>          typedef PhysPageEntry Node[L2_SIZE];
>>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
>>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
>>>> -                                      phys_map_nodes_nb + nodes);
>>>> -        phys_map_nodes = g_renew(Node, phys_map_nodes,
>>>> -                                 phys_map_nodes_nb_alloc);
>>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
>>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
>>>> +                                      next_pgtbl->phys_map_nodes_nb + nodes);
>>>> +        next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
>>>> +                                 next_pgtbl->phys_map_nodes_nb_alloc);
>>>>      }
>>>>  }
>>>>
>>>> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
>>>>      unsigned i;
>>>>      uint16_t ret;
>>>>
>>>> -    ret = phys_map_nodes_nb++;
>>>> +    ret = next_pgtbl->phys_map_nodes_nb++;
>>>>      assert(ret != PHYS_MAP_NODE_NIL);
>>>> -    assert(ret != phys_map_nodes_nb_alloc);
>>>> +    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
>>>>      for (i = 0; i < L2_SIZE; ++i) {
>>>> -        phys_map_nodes[ret][i].is_leaf = 0;
>>>> -        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>> +        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
>>>> +        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>>      }
>>>>      return ret;
>>>>  }
>>>>
>>>> -static void phys_map_nodes_reset(void)
>>>> -{
>>>> -    phys_map_nodes_nb = 0;
>>>> -}
>>>> -
>>>> -
>>>>  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>>                                  hwaddr *nb, uint16_t leaf,
>>>>                                  int level)
>>>> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>>
>>>>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>>>>          lp->ptr = phys_map_node_alloc();
>>>> -        p = phys_map_nodes[lp->ptr];
>>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>>          if (level == 0) {
>>>>              for (i = 0; i < L2_SIZE; i++) {
>>>>                  p[i].is_leaf = 1;
>>>> -                p[i].ptr = phys_section_unassigned;
>>>> +                p[i].ptr = next_pgtbl->phys_section_unassigned;
>>>>              }
>>>>          }
>>>>      } else {
>>>> -        p = phys_map_nodes[lp->ptr];
>>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>>      }
>>>>      lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>>>>
>>>> @@ -192,11 +197,13 @@ static PhysSection *phys_section_find(AddressSpaceDispatch *d,
>>>>  {
>>>>      PhysPageEntry lp = d->phys_map;
>>>>      PhysPageEntry *p;
>>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>> +    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>>>>      int i;
>>>>
>>>>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>>>>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
>>>> -            return &phys_sections[phys_section_unassigned];
>>>> +            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>>>>          }
>>>>          p = phys_map_nodes[lp.ptr];
>>>>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
>>>> @@ -209,6 +216,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
>>>>  {
>>>>      PhysSection *psection;
>>>>      uint16_t idx;
>>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>>
>>>>      psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
>>>>      if (psection->sub_section) {
>>>> @@ -246,7 +254,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
>>>>                  | (addr & iotlb.addr_mask));
>>>>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>>>>          if (!iotlb.perm[is_write]) {
>>>> -            section = &phys_sections[phys_section_unassigned].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>>>>              break;
>>>>          }
>>>>
>>>> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>>          iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
>>>>              + xlat;
>>>>          if (!section->readonly) {
>>>> -            iotlb |= phys_section_notdirty;
>>>> +            iotlb |= cur_pgtbl->phys_section_notdirty;
>>>>          } else {
>>>> -            iotlb |= phys_section_rom;
>>>> +            iotlb |= cur_pgtbl->phys_section_rom;
>>>>          }
>>>>      } else {
>>>> -        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
>>>> +        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
>>>>          iotlb += xlat;
>>>>      }
>>>>
>>>> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>>          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
>>>>              /* Avoid trapping reads of pages with a write breakpoint. */
>>>>              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
>>>> -                iotlb = phys_section_watch + paddr;
>>>> +                iotlb = cur_pgtbl->phys_section_watch + paddr;
>>>>                  *address |= TLB_MMIO;
>>>>                  break;
>>>>              }
>>>> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, uint32_t start,
>>>>                                 uint32_t end, uint16_t section);
>>>>  static void subsections_init(PhysSection *psection);
>>>>
>>>> -static void destroy_page_desc(uint16_t section_index)
>>>> +/* Call after all listener has been commit.
>>>> +  * we do not walk over tree, just simply drop.
>>>> +  */
>>>> +static void destroy_pagetable(PhysPageTable *pgtbl)
>>>>  {
>>>> -    g_free(phys_sections[section_index].sub_section);
>>>> -}
>>>> -
>>>> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
>>>> -{
>>>> -    unsigned i;
>>>> -    PhysPageEntry *p;
>>>> +    int i;
>>>>
>>>> -    if (lp->ptr == PHYS_MAP_NODE_NIL) {
>>>> -        return;
>>>> -    }
>>>> +    g_free(pgtbl->phys_map_nodes);
>>>>
>>>> -    p = phys_map_nodes[lp->ptr];
>>>> -    for (i = 0; i < L2_SIZE; ++i) {
>>>> -        if (!p[i].is_leaf) {
>>>> -            destroy_l2_mapping(&p[i], level - 1);
>>>> +    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
>>>> +        if (pgtbl->phys_sections[i].sub_section) {
>>>> +            g_free(pgtbl->phys_sections[i].sub_section);
>>>>          } else {
>>>> -            destroy_page_desc(p[i].ptr);
>>>> +            memory_region_unref(pgtbl->phys_sections[i].section.mr);
>>>>          }
>>>>      }
>>>> -    lp->is_leaf = 0;
>>>> -    lp->ptr = PHYS_MAP_NODE_NIL;
>>>> -}
>>>> +    g_free(pgtbl->phys_sections);
>>>>
>>>> -static void destroy_all_mappings(AddressSpaceDispatch *d)
>>>> -{
>>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>> -    phys_map_nodes_reset();
>>>> +    g_free(pgtbl);
>>>>  }
>>>
>>> This should be a separate patch.
>>>
>> Sorry, this is part of reclaimer to play destruction of prev pgtbl.
>> Why to separate it?
>
> You are changing the algorithm by which the old pagetable is destroyed.
>  This applies just as well to the current code.
>
> BTW, your patch includes Jan's subpage patch which was broken.  I'll
> drop it from the branch next time I push.
>
> Paolo
>
>>>> -static uint16_t phys_section_add(MemoryRegionSection *section)
>>>> +static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)
>>>
>>> phys_section_add will always add to next_pgtbl.
>>>
>> Ok, will drop @pgtbl.
>>>>  {
>>>> -    assert(phys_sections_nb < TARGET_PAGE_SIZE);
>>>> +    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>>>>
>>>> -    if (phys_sections_nb == phys_sections_nb_alloc) {
>>>> -        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
>>>> -        phys_sections = g_renew(PhysSection, phys_sections,
>>>> -                                phys_sections_nb_alloc);
>>>> +    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
>>>> +        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
>>>> +        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
>>>> +                                pgtbl->phys_sections_nb_alloc);
>>>>      }
>>>> -    phys_sections[phys_sections_nb].section = *section;
>>>> -    phys_sections[phys_sections_nb].sub_section = NULL;
>>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
>>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
>>>>      memory_region_ref(section->mr);
>>>> -    return phys_sections_nb++;
>>>> -}
>>>> -
>>>> -static void phys_sections_clear(void)
>>>> -{
>>>> -    while (phys_sections_nb > 0) {
>>>> -        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
>>>> -        memory_region_unref(phys_section->section.mr);
>>>> -    }
>>>> +    return pgtbl->phys_sections_nb++;
>>>>  }
>>>>
>>>>  static void register_subsection(AddressSpaceDispatch *d,
>>>> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch *d,
>>>>             psection->section.mr == &io_mem_unassigned);
>>>>
>>>>      if (!psection->sub_section) {
>>>> -        new_section = phys_section_add(&subsection);
>>>> -        psection = &phys_sections[new_section];
>>>> +        new_section = phys_section_add(&subsection, next_pgtbl);
>>>> +        psection = &next_pgtbl->phys_sections[new_section];
>>>>          subsections_init(psection);
>>>>          phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
>>>>      } else {
>>>> -        new_section = PHYS_SECTION_ID(psection);
>>>> +        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
>>>>      }
>>>>
>>>> -    new_subsection = phys_section_add(section);
>>>> +    new_subsection = phys_section_add(section, next_pgtbl);
>>>>
>>>>      /* phys_section_add invalidates psection, reload it  */
>>>> -    psection = &phys_sections[new_section];
>>>> +    psection = &next_pgtbl->phys_sections[new_section];
>>>>      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
>>>>      end = start + section->size - 1;
>>>>      subsection_register(psection, start, end, new_subsection);
>>>> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
>>>>      hwaddr start_addr = section->offset_within_address_space;
>>>>      ram_addr_t size = section->size;
>>>>      hwaddr addr;
>>>> -    uint16_t section_index = phys_section_add(section);
>>>> +    uint16_t section_index = phys_section_add(section, next_pgtbl);
>>>>
>>>>      assert(size);
>>>>
>>>> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
>>>>  {
>>>>      psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
>>>>      subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
>>>> -                        phys_section_unassigned);
>>>> +                        next_pgtbl->phys_section_unassigned);
>>>>  }
>>>>
>>>>  static uint16_t dummy_section(MemoryRegion *mr)
>>>> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
>>>>          .size = UINT64_MAX,
>>>>      };
>>>>
>>>> -    return phys_section_add(&section);
>>>> +    return phys_section_add(&section, next_pgtbl);
>>>>  }
>>>>
>>>>  MemoryRegion *iotlb_to_region(hwaddr index)
>>>>  {
>>>> -    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>> +    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>>  }
>>>>
>>>>  static void io_mem_init(void)
>>>> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
>>>>                            "watch", UINT64_MAX);
>>>>  }
>>>>
>>>> +void global_begin(void)
>>>> +{
>>>> +    next_pgtbl = g_new0(PhysPageTable, 1);
>>>> +    next_pgtbl->ref = 1;
>>>> +    next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>>> +    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>>> +    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
>>>> +    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
>>>> +}
>>>> +
>>>> +/* other listeners finished */
>>>> +void global_commit(void)
>>>> +{
>>>> +    PhysPageTable *t = cur_pgtbl;
>>>> +
>>>> +    cur_pgtbl = next_pgtbl;
>>>> +    /* Fix me,  currently, we rely on each address space listener agaist its
>>>> +      * reader. So when we come here, no readers will touch old phys_map_node.
>>>> +      * After rcu, should changed to call_rcu()
>>>> +      */
>>>> +    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
>>>> +        destroy_pagetable(t);
>>>> +    }
>>>
>>> global_commit should not be needed, and global_begin should be simply
>>> the new core_begin.  It should unref next_pgtbl and reallocate a new one.
>>>
>> Good idea, thanks.
>>
>> Regards,
>> Pingfan
>>>
>>>> +}
>>>> +
>>>>  static void mem_begin(MemoryListener *listener)
>>>>  {
>>>>      AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
>>>>
>>>> -    destroy_all_mappings(d);
>>>>      d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>>>>  }
>>>>
>>>>  static void core_begin(MemoryListener *listener)
>>>>  {
>>>> -    phys_sections_clear();
>>>> -    phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>>> -    phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>>> -    phys_section_rom = dummy_section(&io_mem_rom);
>>>> -    phys_section_watch = dummy_section(&io_mem_watch);
>>>>  }
>>>>
>>>>  static void tcg_commit(MemoryListener *listener)
>>>> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>>>      AddressSpaceDispatch *d = as->dispatch;
>>>>
>>>>      memory_listener_unregister(&d->listener);
>>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>>      g_free(d);
>>>>      as->dispatch = NULL;
>>>>  }
>>>> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>>>>
>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>          if (memory_region_is_ram(section->mr)) {
>>>> -            section = &phys_sections[phys_section_rom].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>          }
>>>>          io_mem_write(section->mr, addr, val, 4);
>>>>      } else {
>>>> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>>>>
>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>          if (memory_region_is_ram(section->mr)) {
>>>> -            section = &phys_sections[phys_section_rom].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>          }
>>>>  #ifdef TARGET_WORDS_BIGENDIAN
>>>>          io_mem_write(section->mr, addr, val >> 32, 4);
>>>> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, uint32_t val,
>>>>
>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>          if (memory_region_is_ram(section->mr)) {
>>>> -            section = &phys_sections[phys_section_rom].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>          }
>>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>>> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, uint32_t val,
>>>>
>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>          if (memory_region_is_ram(section->mr)) {
>>>> -            section = &phys_sections[phys_section_rom].section;
>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>          }
>>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>>> index b97ace7..cc654fa 100644
>>>> --- a/include/exec/memory.h
>>>> +++ b/include/exec/memory.h
>>>> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
>>>>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>>>>                           int is_write, hwaddr access_len);
>>>>
>>>> +void global_begin(void);
>>>> +void global_commit(void);
>>>>
>>>>  #endif
>>>>
>>>> diff --git a/memory.c b/memory.c
>>>> index 1a86607..da06dfd 100644
>>>> --- a/memory.c
>>>> +++ b/memory.c
>>>> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
>>>>      --memory_region_transaction_depth;
>>>>      if (!memory_region_transaction_depth && memory_region_update_pending) {
>>>>          memory_region_update_pending = false;
>>>> +        global_begin();
>>>>          MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>>>>
>>>>          QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
>>>> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
>>>>          }
>>>>
>>>>          MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
>>>> +        global_commit();
>>>>      }
>>>>  }
>>>>
>>>>
>>>
>>
>>
>
Paolo Bonzini - May 27, 2013, 11:54 a.m.
Il 26/05/2013 15:02, liu ping fan ha scritto:
> [...]
>>>>> +static PhysPageTable *cur_pgtbl;
>>>>> +static PhysPageTable *next_pgtbl;
>>>>
>>>> You shouldn't need cur_pgtbl.  Instead, each AddressSpaceDispatch should
>>>> have a pointer to its own cur_pgtbl.  In the commit hook you can then
>>>> take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
>>>> the new one.
>>>>
>>> Here is a trick. All AddressSpaceDispatch will update its radix tree
>>> at the same time, so the re-claimer will come after all of them commit
>>> to drop the old phys_map_node[] which holds the radix trees all at
>>> once. Is it OK? Or you still prefer  each AddressSpaceDispatch has its
>>> own space for radix tree?
>>
>> I'm not sure... I prefer to have more stuff in AddressSpaceDispatch to
>> make things as streamlined as possible on the read side.
>>
> When I try to make each AddressSpaceDispatch own a separate radix
> tree,

Just to check we are on the same page: the radix tree will usually be
the same for all AddressSpaceDispatch instances, except if the commit
hook is running concurrently.  Is this right?

 I found in subpage_read/write(), we need to retrieve
> "phys_sections" for this subpage, ie,
>    section = &phys_sections[mmio->sub_section[idx]]; will be changed to
>    section = &subpage->as_dispatch->phys_sections[mmio->sub_section[idx]];
> It will sacrifice performance.  Is it acceptable?

It is acceptable, but it is also not necessary anymore. :)  subpage_read
is now doing simply:

static uint64_t subpage_read(void *opaque, hwaddr addr,
                             unsigned len)
{
    subpage_t *subpage = opaque;
    uint8_t buf[4];

#if defined(DEBUG_SUBPAGE)
    printf("%s: subpage %p len %d addr " TARGET_FMT_plx "\n", __func__,
           subpage, len, addr);
#endif
    address_space_read(subpage->as, addr + subpage->base, buf, len);
    switch (len) {
    case 1:
        return ldub_p(buf);
    case 2:
        return lduw_p(buf);
    case 4:
        return ldl_p(buf);
    default:
        abort();
    }
}



You can base your work on the latest version of the code, which is in my
github rcu branch.  It is not very tested, but it should work.

Paolo


> Regards,
> Pingfan
> 
>>>> The lock must also be taken in address_space_translate, together with a
>>>> relatively expensive ref/unref pair (two atomic operations).  We
>>>> probably need real RCU so that we can skip the ref/unref.
>>>>
>>> Currently, without RCU, do we rely on as->lock? So no need ref/unref
>>> on cur_pgtbl?  With RCU, I think we need ref/unref on mr, since
>>> mr->ops->read/write can block, and should be excluded from RCU
>>> section.
>>
>> Yes, we need ref/unref on the region, but in many cases that will be a
>> no-op (most important, for access to system RAM).  And if we can do the
>> dispatch in the RCU section, similar to what I proposed in reviewing
>> your patch 2, we have no ref/unref in the common case of accessing RAM.
>>
>>>>>  #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>>>>>
>>>>> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>>>>>
>>>>>  static void phys_map_node_reserve(unsigned nodes)
>>>>>  {
>>>>> -    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
>>>>> +    if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
>>>>>          typedef PhysPageEntry Node[L2_SIZE];
>>>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
>>>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
>>>>> -                                      phys_map_nodes_nb + nodes);
>>>>> -        phys_map_nodes = g_renew(Node, phys_map_nodes,
>>>>> -                                 phys_map_nodes_nb_alloc);
>>>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
>>>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
>>>>> +                                      next_pgtbl->phys_map_nodes_nb + nodes);
>>>>> +        next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
>>>>> +                                 next_pgtbl->phys_map_nodes_nb_alloc);
>>>>>      }
>>>>>  }
>>>>>
>>>>> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
>>>>>      unsigned i;
>>>>>      uint16_t ret;
>>>>>
>>>>> -    ret = phys_map_nodes_nb++;
>>>>> +    ret = next_pgtbl->phys_map_nodes_nb++;
>>>>>      assert(ret != PHYS_MAP_NODE_NIL);
>>>>> -    assert(ret != phys_map_nodes_nb_alloc);
>>>>> +    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
>>>>>      for (i = 0; i < L2_SIZE; ++i) {
>>>>> -        phys_map_nodes[ret][i].is_leaf = 0;
>>>>> -        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>>> +        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
>>>>> +        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>>>      }
>>>>>      return ret;
>>>>>  }
>>>>>
>>>>> -static void phys_map_nodes_reset(void)
>>>>> -{
>>>>> -    phys_map_nodes_nb = 0;
>>>>> -}
>>>>> -
>>>>> -
>>>>>  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>>>                                  hwaddr *nb, uint16_t leaf,
>>>>>                                  int level)
>>>>> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>>>
>>>>>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>>>>>          lp->ptr = phys_map_node_alloc();
>>>>> -        p = phys_map_nodes[lp->ptr];
>>>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>>>          if (level == 0) {
>>>>>              for (i = 0; i < L2_SIZE; i++) {
>>>>>                  p[i].is_leaf = 1;
>>>>> -                p[i].ptr = phys_section_unassigned;
>>>>> +                p[i].ptr = next_pgtbl->phys_section_unassigned;
>>>>>              }
>>>>>          }
>>>>>      } else {
>>>>> -        p = phys_map_nodes[lp->ptr];
>>>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>>>      }
>>>>>      lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>>>>>
>>>>> @@ -192,11 +197,13 @@ static PhysSection *phys_section_find(AddressSpaceDispatch *d,
>>>>>  {
>>>>>      PhysPageEntry lp = d->phys_map;
>>>>>      PhysPageEntry *p;
>>>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>>> +    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>>>>>      int i;
>>>>>
>>>>>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>>>>>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
>>>>> -            return &phys_sections[phys_section_unassigned];
>>>>> +            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>>>>>          }
>>>>>          p = phys_map_nodes[lp.ptr];
>>>>>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
>>>>> @@ -209,6 +216,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
>>>>>  {
>>>>>      PhysSection *psection;
>>>>>      uint16_t idx;
>>>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>>>
>>>>>      psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
>>>>>      if (psection->sub_section) {
>>>>> @@ -246,7 +254,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
>>>>>                  | (addr & iotlb.addr_mask));
>>>>>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>>>>>          if (!iotlb.perm[is_write]) {
>>>>> -            section = &phys_sections[phys_section_unassigned].section;
>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>>>>>              break;
>>>>>          }
>>>>>
>>>>> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>>>          iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
>>>>>              + xlat;
>>>>>          if (!section->readonly) {
>>>>> -            iotlb |= phys_section_notdirty;
>>>>> +            iotlb |= cur_pgtbl->phys_section_notdirty;
>>>>>          } else {
>>>>> -            iotlb |= phys_section_rom;
>>>>> +            iotlb |= cur_pgtbl->phys_section_rom;
>>>>>          }
>>>>>      } else {
>>>>> -        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
>>>>> +        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
>>>>>          iotlb += xlat;
>>>>>      }
>>>>>
>>>>> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>>>          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
>>>>>              /* Avoid trapping reads of pages with a write breakpoint. */
>>>>>              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
>>>>> -                iotlb = phys_section_watch + paddr;
>>>>> +                iotlb = cur_pgtbl->phys_section_watch + paddr;
>>>>>                  *address |= TLB_MMIO;
>>>>>                  break;
>>>>>              }
>>>>> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, uint32_t start,
>>>>>                                 uint32_t end, uint16_t section);
>>>>>  static void subsections_init(PhysSection *psection);
>>>>>
>>>>> -static void destroy_page_desc(uint16_t section_index)
>>>>> +/* Call after all listener has been commit.
>>>>> +  * we do not walk over tree, just simply drop.
>>>>> +  */
>>>>> +static void destroy_pagetable(PhysPageTable *pgtbl)
>>>>>  {
>>>>> -    g_free(phys_sections[section_index].sub_section);
>>>>> -}
>>>>> -
>>>>> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
>>>>> -{
>>>>> -    unsigned i;
>>>>> -    PhysPageEntry *p;
>>>>> +    int i;
>>>>>
>>>>> -    if (lp->ptr == PHYS_MAP_NODE_NIL) {
>>>>> -        return;
>>>>> -    }
>>>>> +    g_free(pgtbl->phys_map_nodes);
>>>>>
>>>>> -    p = phys_map_nodes[lp->ptr];
>>>>> -    for (i = 0; i < L2_SIZE; ++i) {
>>>>> -        if (!p[i].is_leaf) {
>>>>> -            destroy_l2_mapping(&p[i], level - 1);
>>>>> +    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
>>>>> +        if (pgtbl->phys_sections[i].sub_section) {
>>>>> +            g_free(pgtbl->phys_sections[i].sub_section);
>>>>>          } else {
>>>>> -            destroy_page_desc(p[i].ptr);
>>>>> +            memory_region_unref(pgtbl->phys_sections[i].section.mr);
>>>>>          }
>>>>>      }
>>>>> -    lp->is_leaf = 0;
>>>>> -    lp->ptr = PHYS_MAP_NODE_NIL;
>>>>> -}
>>>>> +    g_free(pgtbl->phys_sections);
>>>>>
>>>>> -static void destroy_all_mappings(AddressSpaceDispatch *d)
>>>>> -{
>>>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>>> -    phys_map_nodes_reset();
>>>>> +    g_free(pgtbl);
>>>>>  }
>>>>
>>>> This should be a separate patch.
>>>>
>>> Sorry, this is part of reclaimer to play destruction of prev pgtbl.
>>> Why to separate it?
>>
>> You are changing the algorithm by which the old pagetable is destroyed.
>>  This applies just as well to the current code.
>>
>> BTW, your patch includes Jan's subpage patch which was broken.  I'll
>> drop it from the branch next time I push.
>>
>> Paolo
>>
>>>>> -static uint16_t phys_section_add(MemoryRegionSection *section)
>>>>> +static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)
>>>>
>>>> phys_section_add will always add to next_pgtbl.
>>>>
>>> Ok, will drop @pgtbl.
>>>>>  {
>>>>> -    assert(phys_sections_nb < TARGET_PAGE_SIZE);
>>>>> +    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>>>>>
>>>>> -    if (phys_sections_nb == phys_sections_nb_alloc) {
>>>>> -        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
>>>>> -        phys_sections = g_renew(PhysSection, phys_sections,
>>>>> -                                phys_sections_nb_alloc);
>>>>> +    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
>>>>> +        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
>>>>> +        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
>>>>> +                                pgtbl->phys_sections_nb_alloc);
>>>>>      }
>>>>> -    phys_sections[phys_sections_nb].section = *section;
>>>>> -    phys_sections[phys_sections_nb].sub_section = NULL;
>>>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
>>>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
>>>>>      memory_region_ref(section->mr);
>>>>> -    return phys_sections_nb++;
>>>>> -}
>>>>> -
>>>>> -static void phys_sections_clear(void)
>>>>> -{
>>>>> -    while (phys_sections_nb > 0) {
>>>>> -        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
>>>>> -        memory_region_unref(phys_section->section.mr);
>>>>> -    }
>>>>> +    return pgtbl->phys_sections_nb++;
>>>>>  }
>>>>>
>>>>>  static void register_subsection(AddressSpaceDispatch *d,
>>>>> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch *d,
>>>>>             psection->section.mr == &io_mem_unassigned);
>>>>>
>>>>>      if (!psection->sub_section) {
>>>>> -        new_section = phys_section_add(&subsection);
>>>>> -        psection = &phys_sections[new_section];
>>>>> +        new_section = phys_section_add(&subsection, next_pgtbl);
>>>>> +        psection = &next_pgtbl->phys_sections[new_section];
>>>>>          subsections_init(psection);
>>>>>          phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
>>>>>      } else {
>>>>> -        new_section = PHYS_SECTION_ID(psection);
>>>>> +        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
>>>>>      }
>>>>>
>>>>> -    new_subsection = phys_section_add(section);
>>>>> +    new_subsection = phys_section_add(section, next_pgtbl);
>>>>>
>>>>>      /* phys_section_add invalidates psection, reload it  */
>>>>> -    psection = &phys_sections[new_section];
>>>>> +    psection = &next_pgtbl->phys_sections[new_section];
>>>>>      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
>>>>>      end = start + section->size - 1;
>>>>>      subsection_register(psection, start, end, new_subsection);
>>>>> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
>>>>>      hwaddr start_addr = section->offset_within_address_space;
>>>>>      ram_addr_t size = section->size;
>>>>>      hwaddr addr;
>>>>> -    uint16_t section_index = phys_section_add(section);
>>>>> +    uint16_t section_index = phys_section_add(section, next_pgtbl);
>>>>>
>>>>>      assert(size);
>>>>>
>>>>> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
>>>>>  {
>>>>>      psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
>>>>>      subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
>>>>> -                        phys_section_unassigned);
>>>>> +                        next_pgtbl->phys_section_unassigned);
>>>>>  }
>>>>>
>>>>>  static uint16_t dummy_section(MemoryRegion *mr)
>>>>> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
>>>>>          .size = UINT64_MAX,
>>>>>      };
>>>>>
>>>>> -    return phys_section_add(&section);
>>>>> +    return phys_section_add(&section, next_pgtbl);
>>>>>  }
>>>>>
>>>>>  MemoryRegion *iotlb_to_region(hwaddr index)
>>>>>  {
>>>>> -    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>>> +    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>>>  }
>>>>>
>>>>>  static void io_mem_init(void)
>>>>> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
>>>>>                            "watch", UINT64_MAX);
>>>>>  }
>>>>>
>>>>> +void global_begin(void)
>>>>> +{
>>>>> +    next_pgtbl = g_new0(PhysPageTable, 1);
>>>>> +    next_pgtbl->ref = 1;
>>>>> +    next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>>>> +    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>>>> +    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
>>>>> +    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
>>>>> +}
>>>>> +
>>>>> +/* other listeners finished */
>>>>> +void global_commit(void)
>>>>> +{
>>>>> +    PhysPageTable *t = cur_pgtbl;
>>>>> +
>>>>> +    cur_pgtbl = next_pgtbl;
>>>>> +    /* Fix me,  currently, we rely on each address space listener agaist its
>>>>> +      * reader. So when we come here, no readers will touch old phys_map_node.
>>>>> +      * After rcu, should changed to call_rcu()
>>>>> +      */
>>>>> +    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
>>>>> +        destroy_pagetable(t);
>>>>> +    }
>>>>
>>>> global_commit should not be needed, and global_begin should be simply
>>>> the new core_begin.  It should unref next_pgtbl and reallocate a new one.
>>>>
>>> Good idea, thanks.
>>>
>>> Regards,
>>> Pingfan
>>>>
>>>>> +}
>>>>> +
>>>>>  static void mem_begin(MemoryListener *listener)
>>>>>  {
>>>>>      AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
>>>>>
>>>>> -    destroy_all_mappings(d);
>>>>>      d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>>>>>  }
>>>>>
>>>>>  static void core_begin(MemoryListener *listener)
>>>>>  {
>>>>> -    phys_sections_clear();
>>>>> -    phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>>>> -    phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>>>> -    phys_section_rom = dummy_section(&io_mem_rom);
>>>>> -    phys_section_watch = dummy_section(&io_mem_watch);
>>>>>  }
>>>>>
>>>>>  static void tcg_commit(MemoryListener *listener)
>>>>> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>>>>      AddressSpaceDispatch *d = as->dispatch;
>>>>>
>>>>>      memory_listener_unregister(&d->listener);
>>>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>>>      g_free(d);
>>>>>      as->dispatch = NULL;
>>>>>  }
>>>>> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>>>>>
>>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>>          if (memory_region_is_ram(section->mr)) {
>>>>> -            section = &phys_sections[phys_section_rom].section;
>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>>          }
>>>>>          io_mem_write(section->mr, addr, val, 4);
>>>>>      } else {
>>>>> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>>>>>
>>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>>          if (memory_region_is_ram(section->mr)) {
>>>>> -            section = &phys_sections[phys_section_rom].section;
>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>>          }
>>>>>  #ifdef TARGET_WORDS_BIGENDIAN
>>>>>          io_mem_write(section->mr, addr, val >> 32, 4);
>>>>> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, uint32_t val,
>>>>>
>>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>>          if (memory_region_is_ram(section->mr)) {
>>>>> -            section = &phys_sections[phys_section_rom].section;
>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>>          }
>>>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>>>> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, uint32_t val,
>>>>>
>>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>>          if (memory_region_is_ram(section->mr)) {
>>>>> -            section = &phys_sections[phys_section_rom].section;
>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>>          }
>>>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>>>> index b97ace7..cc654fa 100644
>>>>> --- a/include/exec/memory.h
>>>>> +++ b/include/exec/memory.h
>>>>> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
>>>>>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>>>>>                           int is_write, hwaddr access_len);
>>>>>
>>>>> +void global_begin(void);
>>>>> +void global_commit(void);
>>>>>
>>>>>  #endif
>>>>>
>>>>> diff --git a/memory.c b/memory.c
>>>>> index 1a86607..da06dfd 100644
>>>>> --- a/memory.c
>>>>> +++ b/memory.c
>>>>> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
>>>>>      --memory_region_transaction_depth;
>>>>>      if (!memory_region_transaction_depth && memory_region_update_pending) {
>>>>>          memory_region_update_pending = false;
>>>>> +        global_begin();
>>>>>          MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>>>>>
>>>>>          QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
>>>>> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
>>>>>          }
>>>>>
>>>>>          MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
>>>>> +        global_commit();
>>>>>      }
>>>>>  }
>>>>>
>>>>>
>>>>
>>>
>>>
>>
pingfan liu - May 29, 2013, 1:52 a.m.
On Mon, May 27, 2013 at 7:54 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 26/05/2013 15:02, liu ping fan ha scritto:
>> [...]
>>>>>> +static PhysPageTable *cur_pgtbl;
>>>>>> +static PhysPageTable *next_pgtbl;
>>>>>
>>>>> You shouldn't need cur_pgtbl.  Instead, each AddressSpaceDispatch should
>>>>> have a pointer to its own cur_pgtbl.  In the commit hook you can then
>>>>> take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
>>>>> the new one.
>>>>>
>>>> Here is a trick. All AddressSpaceDispatch will update its radix tree
>>>> at the same time, so the re-claimer will come after all of them commit
>>>> to drop the old phys_map_node[] which holds the radix trees all at
>>>> once. Is it OK? Or you still prefer  each AddressSpaceDispatch has its
>>>> own space for radix tree?
>>>
>>> I'm not sure... I prefer to have more stuff in AddressSpaceDispatch to
>>> make things as streamlined as possible on the read side.
>>>
>> When I try to make each AddressSpaceDispatch own a separate radix
>> tree,
>
> Just to check we are on the same page: the radix tree will usually be the same for all AddressSpaceDispatch instances,

I'm not sure what you mean, but the radix tree for an AddressSpaceDispatch
is determined by the topology of its as->root,
so I think they are not the same.

> , except if the commit hook is running concurrently.  Is this right?
>
yes, :)   I found the same issue here. I.e. when the memory topology is
updated, all the radix trees are rebuilt, so it is the collection of radix
trees that composes the dispatch context. The AddressSpaces form a single
unit and must not be split, so they should be switched atomically.

>  I found in subpage_read/write(), we need to retrieve
>> "phys_sections" for this subpage, ie,
>>    section = &phys_sections[mmio->sub_section[idx]]; will be changed to
>>    section = &subpage->as_dispatch->phys_sections[mmio->sub_section[idx]];
>> It will sacrifice performance.  Is it acceptable?
>
> It is acceptable, but it is also not necessary anymore. :)  subpage_read
> is now doing simply:
>
> static uint64_t subpage_read(void *opaque, hwaddr addr,
>                              unsigned len)
> {
>     subpage_t *subpage = opaque;
>     uint8_t buf[4];
>
> #if defined(DEBUG_SUBPAGE)
>     printf("%s: subpage %p len %d addr " TARGET_FMT_plx "\n", __func__,
>            subpage, len, addr);
> #endif
>     address_space_read(subpage->as, addr + subpage->base, buf, len);

ah, seems I miss such code :)
>     switch (len) {
>     case 1:
>         return ldub_p(buf);
>     case 2:
>         return lduw_p(buf);
>     case 4:
>         return ldl_p(buf);
>     default:
>         abort();
>     }
> }
>
>
>
> You can base your work on the latest version of the code, which is in my
> github rcu branch.  It is not very tested, but it should work.
>
I have done some stuff based on your iommu branch.  I will rebase it
onto your rcu branch for the next version.

Thanks,
Pingfan

> Paolo
>
>
>> Regards,
>> Pingfan
>>
>>>>> The lock must also be taken in address_space_translate, together with a
>>>>> relatively expensive ref/unref pair (two atomic operations).  We
>>>>> probably need real RCU so that we can skip the ref/unref.
>>>>>
>>>> Currently, without RCU, do we rely on as->lock? So no need ref/unref
>>>> on cur_pgtbl?  With RCU, I think we need ref/unref on mr, since
>>>> mr->ops->read/write can block, and should be excluded from RCU
>>>> section.
>>>
>>> Yes, we need ref/unref on the region, but in many cases that will be a
>>> no-op (most important, for access to system RAM).  And if we can do the
>>> dispatch in the RCU section, similar to what I proposed in reviewing
>>> your patch 2, we have no ref/unref in the common case of accessing RAM.
>>>
>>>>>>  #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>>>>>>
>>>>>> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>>>>>>
>>>>>>  static void phys_map_node_reserve(unsigned nodes)
>>>>>>  {
>>>>>> -    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
>>>>>> +    if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
>>>>>>          typedef PhysPageEntry Node[L2_SIZE];
>>>>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
>>>>>> -        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
>>>>>> -                                      phys_map_nodes_nb + nodes);
>>>>>> -        phys_map_nodes = g_renew(Node, phys_map_nodes,
>>>>>> -                                 phys_map_nodes_nb_alloc);
>>>>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
>>>>>> +        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
>>>>>> +                                      next_pgtbl->phys_map_nodes_nb + nodes);
>>>>>> +        next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
>>>>>> +                                 next_pgtbl->phys_map_nodes_nb_alloc);
>>>>>>      }
>>>>>>  }
>>>>>>
>>>>>> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
>>>>>>      unsigned i;
>>>>>>      uint16_t ret;
>>>>>>
>>>>>> -    ret = phys_map_nodes_nb++;
>>>>>> +    ret = next_pgtbl->phys_map_nodes_nb++;
>>>>>>      assert(ret != PHYS_MAP_NODE_NIL);
>>>>>> -    assert(ret != phys_map_nodes_nb_alloc);
>>>>>> +    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
>>>>>>      for (i = 0; i < L2_SIZE; ++i) {
>>>>>> -        phys_map_nodes[ret][i].is_leaf = 0;
>>>>>> -        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>>>> +        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
>>>>>> +        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
>>>>>>      }
>>>>>>      return ret;
>>>>>>  }
>>>>>>
>>>>>> -static void phys_map_nodes_reset(void)
>>>>>> -{
>>>>>> -    phys_map_nodes_nb = 0;
>>>>>> -}
>>>>>> -
>>>>>> -
>>>>>>  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>>>>                                  hwaddr *nb, uint16_t leaf,
>>>>>>                                  int level)
>>>>>> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>>>>>>
>>>>>>      if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
>>>>>>          lp->ptr = phys_map_node_alloc();
>>>>>> -        p = phys_map_nodes[lp->ptr];
>>>>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>>>>          if (level == 0) {
>>>>>>              for (i = 0; i < L2_SIZE; i++) {
>>>>>>                  p[i].is_leaf = 1;
>>>>>> -                p[i].ptr = phys_section_unassigned;
>>>>>> +                p[i].ptr = next_pgtbl->phys_section_unassigned;
>>>>>>              }
>>>>>>          }
>>>>>>      } else {
>>>>>> -        p = phys_map_nodes[lp->ptr];
>>>>>> +        p = next_pgtbl->phys_map_nodes[lp->ptr];
>>>>>>      }
>>>>>>      lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>>>>>>
>>>>>> @@ -192,11 +197,13 @@ static PhysSection *phys_section_find(AddressSpaceDispatch *d,
>>>>>>  {
>>>>>>      PhysPageEntry lp = d->phys_map;
>>>>>>      PhysPageEntry *p;
>>>>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>>>> +    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>>>>>>      int i;
>>>>>>
>>>>>>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>>>>>>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
>>>>>> -            return &phys_sections[phys_section_unassigned];
>>>>>> +            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>>>>>>          }
>>>>>>          p = phys_map_nodes[lp.ptr];
>>>>>>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
>>>>>> @@ -209,6 +216,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
>>>>>>  {
>>>>>>      PhysSection *psection;
>>>>>>      uint16_t idx;
>>>>>> +    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>>>>
>>>>>>      psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
>>>>>>      if (psection->sub_section) {
>>>>>> @@ -246,7 +254,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
>>>>>>                  | (addr & iotlb.addr_mask));
>>>>>>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>>>>>>          if (!iotlb.perm[is_write]) {
>>>>>> -            section = &phys_sections[phys_section_unassigned].section;
>>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>>>>>>              break;
>>>>>>          }
>>>>>>
>>>>>> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>>>>          iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
>>>>>>              + xlat;
>>>>>>          if (!section->readonly) {
>>>>>> -            iotlb |= phys_section_notdirty;
>>>>>> +            iotlb |= cur_pgtbl->phys_section_notdirty;
>>>>>>          } else {
>>>>>> -            iotlb |= phys_section_rom;
>>>>>> +            iotlb |= cur_pgtbl->phys_section_rom;
>>>>>>          }
>>>>>>      } else {
>>>>>> -        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
>>>>>> +        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
>>>>>>          iotlb += xlat;
>>>>>>      }
>>>>>>
>>>>>> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
>>>>>>          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
>>>>>>              /* Avoid trapping reads of pages with a write breakpoint. */
>>>>>>              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
>>>>>> -                iotlb = phys_section_watch + paddr;
>>>>>> +                iotlb = cur_pgtbl->phys_section_watch + paddr;
>>>>>>                  *address |= TLB_MMIO;
>>>>>>                  break;
>>>>>>              }
>>>>>> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, uint32_t start,
>>>>>>                                 uint32_t end, uint16_t section);
>>>>>>  static void subsections_init(PhysSection *psection);
>>>>>>
>>>>>> -static void destroy_page_desc(uint16_t section_index)
>>>>>> +/* Called after all listeners have committed.
>>>>>> +  * We do not walk over the tree; we simply drop it.
>>>>>> +  */
>>>>>> +static void destroy_pagetable(PhysPageTable *pgtbl)
>>>>>>  {
>>>>>> -    g_free(phys_sections[section_index].sub_section);
>>>>>> -}
>>>>>> -
>>>>>> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
>>>>>> -{
>>>>>> -    unsigned i;
>>>>>> -    PhysPageEntry *p;
>>>>>> +    int i;
>>>>>>
>>>>>> -    if (lp->ptr == PHYS_MAP_NODE_NIL) {
>>>>>> -        return;
>>>>>> -    }
>>>>>> +    g_free(pgtbl->phys_map_nodes);
>>>>>>
>>>>>> -    p = phys_map_nodes[lp->ptr];
>>>>>> -    for (i = 0; i < L2_SIZE; ++i) {
>>>>>> -        if (!p[i].is_leaf) {
>>>>>> -            destroy_l2_mapping(&p[i], level - 1);
>>>>>> +    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
>>>>>> +        if (pgtbl->phys_sections[i].sub_section) {
>>>>>> +            g_free(pgtbl->phys_sections[i].sub_section);
>>>>>>          } else {
>>>>>> -            destroy_page_desc(p[i].ptr);
>>>>>> +            memory_region_unref(pgtbl->phys_sections[i].section.mr);
>>>>>>          }
>>>>>>      }
>>>>>> -    lp->is_leaf = 0;
>>>>>> -    lp->ptr = PHYS_MAP_NODE_NIL;
>>>>>> -}
>>>>>> +    g_free(pgtbl->phys_sections);
>>>>>>
>>>>>> -static void destroy_all_mappings(AddressSpaceDispatch *d)
>>>>>> -{
>>>>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>>>> -    phys_map_nodes_reset();
>>>>>> +    g_free(pgtbl);
>>>>>>  }
>>>>>
>>>>> This should be a separate patch.
>>>>>
>>>> Sorry, this is part of reclaimer to play destruction of prev pgtbl.
>>>> Why to separate it?
>>>
>>> You are changing the algorithm by which the old pagetable is destroyed.
>>>  This applies just as well to the current code.
>>>
>>> BTW, your patch includes Jan's subpage patch which was broken.  I'll
>>> drop it from the branch next time I push.
>>>
>>> Paolo
>>>
>>>>>> -static uint16_t phys_section_add(MemoryRegionSection *section)
>>>>>> +static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)
>>>>>
>>>>> phys_section_add will always add to next_pgtbl.
>>>>>
>>>> Ok, will drop @pgtbl.
>>>>>>  {
>>>>>> -    assert(phys_sections_nb < TARGET_PAGE_SIZE);
>>>>>> +    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>>>>>>
>>>>>> -    if (phys_sections_nb == phys_sections_nb_alloc) {
>>>>>> -        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
>>>>>> -        phys_sections = g_renew(PhysSection, phys_sections,
>>>>>> -                                phys_sections_nb_alloc);
>>>>>> +    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
>>>>>> +        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
>>>>>> +        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
>>>>>> +                                pgtbl->phys_sections_nb_alloc);
>>>>>>      }
>>>>>> -    phys_sections[phys_sections_nb].section = *section;
>>>>>> -    phys_sections[phys_sections_nb].sub_section = NULL;
>>>>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
>>>>>> +    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
>>>>>>      memory_region_ref(section->mr);
>>>>>> -    return phys_sections_nb++;
>>>>>> -}
>>>>>> -
>>>>>> -static void phys_sections_clear(void)
>>>>>> -{
>>>>>> -    while (phys_sections_nb > 0) {
>>>>>> -        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
>>>>>> -        memory_region_unref(phys_section->section.mr);
>>>>>> -    }
>>>>>> +    return pgtbl->phys_sections_nb++;
>>>>>>  }
>>>>>>
>>>>>>  static void register_subsection(AddressSpaceDispatch *d,
>>>>>> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch *d,
>>>>>>             psection->section.mr == &io_mem_unassigned);
>>>>>>
>>>>>>      if (!psection->sub_section) {
>>>>>> -        new_section = phys_section_add(&subsection);
>>>>>> -        psection = &phys_sections[new_section];
>>>>>> +        new_section = phys_section_add(&subsection, next_pgtbl);
>>>>>> +        psection = &next_pgtbl->phys_sections[new_section];
>>>>>>          subsections_init(psection);
>>>>>>          phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
>>>>>>      } else {
>>>>>> -        new_section = PHYS_SECTION_ID(psection);
>>>>>> +        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
>>>>>>      }
>>>>>>
>>>>>> -    new_subsection = phys_section_add(section);
>>>>>> +    new_subsection = phys_section_add(section, next_pgtbl);
>>>>>>
>>>>>>      /* phys_section_add invalidates psection, reload it  */
>>>>>> -    psection = &phys_sections[new_section];
>>>>>> +    psection = &next_pgtbl->phys_sections[new_section];
>>>>>>      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
>>>>>>      end = start + section->size - 1;
>>>>>>      subsection_register(psection, start, end, new_subsection);
>>>>>> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
>>>>>>      hwaddr start_addr = section->offset_within_address_space;
>>>>>>      ram_addr_t size = section->size;
>>>>>>      hwaddr addr;
>>>>>> -    uint16_t section_index = phys_section_add(section);
>>>>>> +    uint16_t section_index = phys_section_add(section, next_pgtbl);
>>>>>>
>>>>>>      assert(size);
>>>>>>
>>>>>> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
>>>>>>  {
>>>>>>      psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
>>>>>>      subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
>>>>>> -                        phys_section_unassigned);
>>>>>> +                        next_pgtbl->phys_section_unassigned);
>>>>>>  }
>>>>>>
>>>>>>  static uint16_t dummy_section(MemoryRegion *mr)
>>>>>> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
>>>>>>          .size = UINT64_MAX,
>>>>>>      };
>>>>>>
>>>>>> -    return phys_section_add(&section);
>>>>>> +    return phys_section_add(&section, next_pgtbl);
>>>>>>  }
>>>>>>
>>>>>>  MemoryRegion *iotlb_to_region(hwaddr index)
>>>>>>  {
>>>>>> -    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>>>> +    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
>>>>>>  }
>>>>>>
>>>>>>  static void io_mem_init(void)
>>>>>> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
>>>>>>                            "watch", UINT64_MAX);
>>>>>>  }
>>>>>>
>>>>>> +void global_begin(void)
>>>>>> +{
>>>>>> +    next_pgtbl = g_new0(PhysPageTable, 1);
>>>>>> +    next_pgtbl->ref = 1;
>>>>>> +    next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>>>>> +    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>>>>> +    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
>>>>>> +    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
>>>>>> +}
>>>>>> +
>>>>>> +/* other listeners finished */
>>>>>> +void global_commit(void)
>>>>>> +{
>>>>>> +    PhysPageTable *t = cur_pgtbl;
>>>>>> +
>>>>>> +    cur_pgtbl = next_pgtbl;
>>>>>> +    /* Fix me: currently, we rely on each address space listener against its
>>>>>> +      * reader. So when we come here, no readers will touch the old phys_map_node.
>>>>>> +      * After RCU is introduced, this should be changed to call_rcu().
>>>>>> +      */
>>>>>> +    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
>>>>>> +        destroy_pagetable(t);
>>>>>> +    }
>>>>>
>>>>> global_commit should not be needed, and global_begin should be simply
>>>>> the new core_begin.  It should unref next_pgtbl and reallocate a new one.
>>>>>
>>>> Good idea, thanks.
>>>>
>>>> Regards,
>>>> Pingfan
>>>>>
>>>>>> +}
>>>>>> +
>>>>>>  static void mem_begin(MemoryListener *listener)
>>>>>>  {
>>>>>>      AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
>>>>>>
>>>>>> -    destroy_all_mappings(d);
>>>>>>      d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>>>>>>  }
>>>>>>
>>>>>>  static void core_begin(MemoryListener *listener)
>>>>>>  {
>>>>>> -    phys_sections_clear();
>>>>>> -    phys_section_unassigned = dummy_section(&io_mem_unassigned);
>>>>>> -    phys_section_notdirty = dummy_section(&io_mem_notdirty);
>>>>>> -    phys_section_rom = dummy_section(&io_mem_rom);
>>>>>> -    phys_section_watch = dummy_section(&io_mem_watch);
>>>>>>  }
>>>>>>
>>>>>>  static void tcg_commit(MemoryListener *listener)
>>>>>> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>>>>>      AddressSpaceDispatch *d = as->dispatch;
>>>>>>
>>>>>>      memory_listener_unregister(&d->listener);
>>>>>> -    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
>>>>>>      g_free(d);
>>>>>>      as->dispatch = NULL;
>>>>>>  }
>>>>>> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>>>>>>
>>>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>>>          if (memory_region_is_ram(section->mr)) {
>>>>>> -            section = &phys_sections[phys_section_rom].section;
>>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>>>          }
>>>>>>          io_mem_write(section->mr, addr, val, 4);
>>>>>>      } else {
>>>>>> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>>>>>>
>>>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>>>          if (memory_region_is_ram(section->mr)) {
>>>>>> -            section = &phys_sections[phys_section_rom].section;
>>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>>>          }
>>>>>>  #ifdef TARGET_WORDS_BIGENDIAN
>>>>>>          io_mem_write(section->mr, addr, val >> 32, 4);
>>>>>> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, uint32_t val,
>>>>>>
>>>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>>>          if (memory_region_is_ram(section->mr)) {
>>>>>> -            section = &phys_sections[phys_section_rom].section;
>>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>>>          }
>>>>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>>>>> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, uint32_t val,
>>>>>>
>>>>>>      if (!memory_region_is_ram(section->mr) || section->readonly) {
>>>>>>          if (memory_region_is_ram(section->mr)) {
>>>>>> -            section = &phys_sections[phys_section_rom].section;
>>>>>> +            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
>>>>>>          }
>>>>>>  #if defined(TARGET_WORDS_BIGENDIAN)
>>>>>>          if (endian == DEVICE_LITTLE_ENDIAN) {
>>>>>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>>>>>> index b97ace7..cc654fa 100644
>>>>>> --- a/include/exec/memory.h
>>>>>> +++ b/include/exec/memory.h
>>>>>> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
>>>>>>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>>>>>>                           int is_write, hwaddr access_len);
>>>>>>
>>>>>> +void global_begin(void);
>>>>>> +void global_commit(void);
>>>>>>
>>>>>>  #endif
>>>>>>
>>>>>> diff --git a/memory.c b/memory.c
>>>>>> index 1a86607..da06dfd 100644
>>>>>> --- a/memory.c
>>>>>> +++ b/memory.c
>>>>>> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
>>>>>>      --memory_region_transaction_depth;
>>>>>>      if (!memory_region_transaction_depth && memory_region_update_pending) {
>>>>>>          memory_region_update_pending = false;
>>>>>> +        global_begin();
>>>>>>          MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>>>>>>
>>>>>>          QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
>>>>>> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
>>>>>>          }
>>>>>>
>>>>>>          MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
>>>>>> +        global_commit();
>>>>>>      }
>>>>>>  }
>>>>>>
>>>>>>
>>>>>
>>>>
>>>>
>>>
>

Patch

diff --git a/exec.c b/exec.c
index c5f8082..bb4e540 100644
--- a/exec.c
+++ b/exec.c
@@ -80,23 +80,34 @@  int use_icount;
 #if !defined(CONFIG_USER_ONLY)
 
 #define SUBSECTION_IDX(addr)      ((addr) & ~TARGET_PAGE_MASK)
-#define PHYS_SECTION_ID(psection) ((psection) - phys_sections)
+#define PHYS_SECTION_ID(psection, base) ((psection) - base->phys_sections)
 
 typedef struct PhysSection {
     MemoryRegionSection section;
     uint16_t *sub_section;
 } PhysSection;
 
-static PhysSection *phys_sections;
-static unsigned phys_sections_nb, phys_sections_nb_alloc;
-static uint16_t phys_section_unassigned;
-static uint16_t phys_section_notdirty;
-static uint16_t phys_section_rom;
-static uint16_t phys_section_watch;
+typedef PhysPageEntry Node[L2_SIZE];
 
-/* Simple allocator for PhysPageEntry nodes */
-static PhysPageEntry (*phys_map_nodes)[L2_SIZE];
-static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc;
+typedef struct PhysPageTable PhysPageTable;
+
+struct PhysPageTable {
+    int ref;
+    PhysSection *phys_sections;
+    unsigned phys_sections_nb;
+    unsigned phys_sections_nb_alloc;
+    uint16_t phys_section_unassigned;
+    uint16_t phys_section_notdirty;
+    uint16_t phys_section_rom;
+    uint16_t phys_section_watch;
+
+    Node *phys_map_nodes;
+    unsigned phys_map_nodes_nb;
+    unsigned phys_map_nodes_nb_alloc;
+};
+
+static PhysPageTable *cur_pgtbl;
+static PhysPageTable *next_pgtbl;
 
 #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
 
@@ -111,13 +122,13 @@  static MemoryRegion io_mem_watch;
 
 static void phys_map_node_reserve(unsigned nodes)
 {
-    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
+    if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
         typedef PhysPageEntry Node[L2_SIZE];
-        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
-        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
-                                      phys_map_nodes_nb + nodes);
-        phys_map_nodes = g_renew(Node, phys_map_nodes,
-                                 phys_map_nodes_nb_alloc);
+        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
+        next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
+                                      next_pgtbl->phys_map_nodes_nb + nodes);
+        next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
+                                 next_pgtbl->phys_map_nodes_nb_alloc);
     }
 }
 
@@ -126,22 +137,16 @@  static uint16_t phys_map_node_alloc(void)
     unsigned i;
     uint16_t ret;
 
-    ret = phys_map_nodes_nb++;
+    ret = next_pgtbl->phys_map_nodes_nb++;
     assert(ret != PHYS_MAP_NODE_NIL);
-    assert(ret != phys_map_nodes_nb_alloc);
+    assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
     for (i = 0; i < L2_SIZE; ++i) {
-        phys_map_nodes[ret][i].is_leaf = 0;
-        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
+        next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
+        next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
     }
     return ret;
 }
 
-static void phys_map_nodes_reset(void)
-{
-    phys_map_nodes_nb = 0;
-}
-
-
 static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
                                 hwaddr *nb, uint16_t leaf,
                                 int level)
@@ -152,15 +157,15 @@  static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
 
     if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
         lp->ptr = phys_map_node_alloc();
-        p = phys_map_nodes[lp->ptr];
+        p = next_pgtbl->phys_map_nodes[lp->ptr];
         if (level == 0) {
             for (i = 0; i < L2_SIZE; i++) {
                 p[i].is_leaf = 1;
-                p[i].ptr = phys_section_unassigned;
+                p[i].ptr = next_pgtbl->phys_section_unassigned;
             }
         }
     } else {
-        p = phys_map_nodes[lp->ptr];
+        p = next_pgtbl->phys_map_nodes[lp->ptr];
     }
     lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
 
@@ -192,11 +197,13 @@  static PhysSection *phys_section_find(AddressSpaceDispatch *d,
 {
     PhysPageEntry lp = d->phys_map;
     PhysPageEntry *p;
+    PhysSection *phys_sections = cur_pgtbl->phys_sections;
+    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
     int i;
 
     for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
         if (lp.ptr == PHYS_MAP_NODE_NIL) {
-            return &phys_sections[phys_section_unassigned];
+            return &phys_sections[cur_pgtbl->phys_section_unassigned];
         }
         p = phys_map_nodes[lp.ptr];
         lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
@@ -209,6 +216,7 @@  static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
 {
     PhysSection *psection;
     uint16_t idx;
+    PhysSection *phys_sections = cur_pgtbl->phys_sections;
 
     psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
     if (psection->sub_section) {
@@ -246,7 +254,7 @@  MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
                 | (addr & iotlb.addr_mask));
         len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
         if (!iotlb.perm[is_write]) {
-            section = &phys_sections[phys_section_unassigned].section;
+            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
             break;
         }
 
@@ -690,12 +698,12 @@  hwaddr memory_region_section_get_iotlb(CPUArchState *env,
         iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
             + xlat;
         if (!section->readonly) {
-            iotlb |= phys_section_notdirty;
+            iotlb |= cur_pgtbl->phys_section_notdirty;
         } else {
-            iotlb |= phys_section_rom;
+            iotlb |= cur_pgtbl->phys_section_rom;
         }
     } else {
-        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
+        iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
         iotlb += xlat;
     }
 
@@ -705,7 +713,7 @@  hwaddr memory_region_section_get_iotlb(CPUArchState *env,
         if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
             /* Avoid trapping reads of pages with a write breakpoint. */
             if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
-                iotlb = phys_section_watch + paddr;
+                iotlb = cur_pgtbl->phys_section_watch + paddr;
                 *address |= TLB_MMIO;
                 break;
             }
@@ -721,59 +729,40 @@  static int subsection_register(PhysSection *psection, uint32_t start,
                                uint32_t end, uint16_t section);
 static void subsections_init(PhysSection *psection);
 
-static void destroy_page_desc(uint16_t section_index)
+/* Called after all listeners have committed.
+  * We do not walk over the tree; we simply drop it.
+  */
+static void destroy_pagetable(PhysPageTable *pgtbl)
 {
-    g_free(phys_sections[section_index].sub_section);
-}
-
-static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
-{
-    unsigned i;
-    PhysPageEntry *p;
+    int i;
 
-    if (lp->ptr == PHYS_MAP_NODE_NIL) {
-        return;
-    }
+    g_free(pgtbl->phys_map_nodes);
 
-    p = phys_map_nodes[lp->ptr];
-    for (i = 0; i < L2_SIZE; ++i) {
-        if (!p[i].is_leaf) {
-            destroy_l2_mapping(&p[i], level - 1);
+    for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
+        if (pgtbl->phys_sections[i].sub_section) {
+            g_free(pgtbl->phys_sections[i].sub_section);
         } else {
-            destroy_page_desc(p[i].ptr);
+            memory_region_unref(pgtbl->phys_sections[i].section.mr);
         }
     }
-    lp->is_leaf = 0;
-    lp->ptr = PHYS_MAP_NODE_NIL;
-}
+    g_free(pgtbl->phys_sections);
 
-static void destroy_all_mappings(AddressSpaceDispatch *d)
-{
-    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
-    phys_map_nodes_reset();
+    g_free(pgtbl);
 }
 
-static uint16_t phys_section_add(MemoryRegionSection *section)
+static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)
 {
-    assert(phys_sections_nb < TARGET_PAGE_SIZE);
+    assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
 
-    if (phys_sections_nb == phys_sections_nb_alloc) {
-        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
-        phys_sections = g_renew(PhysSection, phys_sections,
-                                phys_sections_nb_alloc);
+    if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
+        pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
+        pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
+                                pgtbl->phys_sections_nb_alloc);
     }
-    phys_sections[phys_sections_nb].section = *section;
-    phys_sections[phys_sections_nb].sub_section = NULL;
+    pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
+    pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
     memory_region_ref(section->mr);
-    return phys_sections_nb++;
-}
-
-static void phys_sections_clear(void)
-{
-    while (phys_sections_nb > 0) {
-        PhysSection *phys_section = &phys_sections[--phys_sections_nb];
-        memory_region_unref(phys_section->section.mr);
-    }
+    return pgtbl->phys_sections_nb++;
 }
 
 static void register_subsection(AddressSpaceDispatch *d,
@@ -793,18 +782,18 @@  static void register_subsection(AddressSpaceDispatch *d,
            psection->section.mr == &io_mem_unassigned);
 
     if (!psection->sub_section) {
-        new_section = phys_section_add(&subsection);
-        psection = &phys_sections[new_section];
+        new_section = phys_section_add(&subsection, next_pgtbl);
+        psection = &next_pgtbl->phys_sections[new_section];
         subsections_init(psection);
         phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
     } else {
-        new_section = PHYS_SECTION_ID(psection);
+        new_section = PHYS_SECTION_ID(psection, next_pgtbl);
     }
 
-    new_subsection = phys_section_add(section);
+    new_subsection = phys_section_add(section, next_pgtbl);
 
     /* phys_section_add invalidates psection, reload it  */
-    psection = &phys_sections[new_section];
+    psection = &next_pgtbl->phys_sections[new_section];
     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
     end = start + section->size - 1;
     subsection_register(psection, start, end, new_subsection);
@@ -816,7 +805,7 @@  static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
     hwaddr start_addr = section->offset_within_address_space;
     ram_addr_t size = section->size;
     hwaddr addr;
-    uint16_t section_index = phys_section_add(section);
+    uint16_t section_index = phys_section_add(section, next_pgtbl);
 
     assert(size);
 
@@ -1653,7 +1642,7 @@  static void subsections_init(PhysSection *psection)
 {
     psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
     subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
-                        phys_section_unassigned);
+                        next_pgtbl->phys_section_unassigned);
 }
 
 static uint16_t dummy_section(MemoryRegion *mr)
@@ -1665,12 +1654,12 @@  static uint16_t dummy_section(MemoryRegion *mr)
         .size = UINT64_MAX,
     };
 
-    return phys_section_add(&section);
+    return phys_section_add(&section, next_pgtbl);
 }
 
 MemoryRegion *iotlb_to_region(hwaddr index)
 {
-    return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
+    return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
 }
 
 static void io_mem_init(void)
@@ -1685,21 +1674,40 @@  static void io_mem_init(void)
                           "watch", UINT64_MAX);
 }
 
+void global_begin(void)
+{
+    next_pgtbl = g_new0(PhysPageTable, 1);
+    next_pgtbl->ref = 1;
+    next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
+    next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
+    next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
+    next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
+}
+
+/* other listeners finished */
+void global_commit(void)
+{
+    PhysPageTable *t = cur_pgtbl;
+
+    cur_pgtbl = next_pgtbl;
+    /* Fix me: currently, we rely on each address space listener against its
+      * reader. So when we come here, no readers will touch the old phys_map_node.
+      * After RCU is introduced, this should be changed to call_rcu().
+      */
+    if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
+        destroy_pagetable(t);
+    }
+}
+
 static void mem_begin(MemoryListener *listener)
 {
     AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
 
-    destroy_all_mappings(d);
     d->phys_map.ptr = PHYS_MAP_NODE_NIL;
 }
 
 static void core_begin(MemoryListener *listener)
 {
-    phys_sections_clear();
-    phys_section_unassigned = dummy_section(&io_mem_unassigned);
-    phys_section_notdirty = dummy_section(&io_mem_notdirty);
-    phys_section_rom = dummy_section(&io_mem_rom);
-    phys_section_watch = dummy_section(&io_mem_watch);
 }
 
 static void tcg_commit(MemoryListener *listener)
@@ -1779,7 +1787,6 @@  void address_space_destroy_dispatch(AddressSpace *as)
     AddressSpaceDispatch *d = as->dispatch;
 
     memory_listener_unregister(&d->listener);
-    destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
     g_free(d);
     as->dispatch = NULL;
 }
@@ -2386,7 +2393,7 @@  void stl_phys_notdirty(hwaddr addr, uint32_t val)
 
     if (!memory_region_is_ram(section->mr) || section->readonly) {
         if (memory_region_is_ram(section->mr)) {
-            section = &phys_sections[phys_section_rom].section;
+            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
         }
         io_mem_write(section->mr, addr, val, 4);
     } else {
@@ -2422,7 +2429,7 @@  void stq_phys_notdirty(hwaddr addr, uint64_t val)
 
     if (!memory_region_is_ram(section->mr) || section->readonly) {
         if (memory_region_is_ram(section->mr)) {
-            section = &phys_sections[phys_section_rom].section;
+            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
         }
 #ifdef TARGET_WORDS_BIGENDIAN
         io_mem_write(section->mr, addr, val >> 32, 4);
@@ -2455,7 +2462,7 @@  static inline void stl_phys_internal(hwaddr addr, uint32_t val,
 
     if (!memory_region_is_ram(section->mr) || section->readonly) {
         if (memory_region_is_ram(section->mr)) {
-            section = &phys_sections[phys_section_rom].section;
+            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
         }
 #if defined(TARGET_WORDS_BIGENDIAN)
         if (endian == DEVICE_LITTLE_ENDIAN) {
@@ -2526,7 +2533,7 @@  static inline void stw_phys_internal(hwaddr addr, uint32_t val,
 
     if (!memory_region_is_ram(section->mr) || section->readonly) {
         if (memory_region_is_ram(section->mr)) {
-            section = &phys_sections[phys_section_rom].section;
+            section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
         }
 #if defined(TARGET_WORDS_BIGENDIAN)
         if (endian == DEVICE_LITTLE_ENDIAN) {
diff --git a/include/exec/memory.h b/include/exec/memory.h
index b97ace7..cc654fa 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -992,6 +992,8 @@  void *address_space_map(AddressSpace *as, hwaddr addr,
 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                          int is_write, hwaddr access_len);
 
+void global_begin(void);
+void global_commit(void);
 
 #endif
 
diff --git a/memory.c b/memory.c
index 1a86607..da06dfd 100644
--- a/memory.c
+++ b/memory.c
@@ -805,6 +805,7 @@  void memory_region_transaction_commit(void)
     --memory_region_transaction_depth;
     if (!memory_region_transaction_depth && memory_region_update_pending) {
         memory_region_update_pending = false;
+        global_begin();
         MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
 
         QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
@@ -812,6 +813,7 @@  void memory_region_transaction_commit(void)
         }
 
         MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
+        global_commit();
     }
 }