Patchwork [v4,07/16] memory: make mmio dispatch able to be out of biglock

login
register
mail settings
Submitter pingfank@linux.vnet.ibm.com
Date Oct. 22, 2012, 9:23 a.m.
Message ID <1350897839-29593-8-git-send-email-pingfank@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/193183/
State New
Headers show

Comments

pingfank@linux.vnet.ibm.com - Oct. 22, 2012, 9:23 a.m.
Without biglock, we try to protect the mr by increasing its refcnt.
If we can not inc the refcnt, we go backward and resort to biglock.

Another point is that the memory radix-tree can be flushed by another
thread, so we should take a copy of the terminal mr to survive
such an issue.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 exec.c |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 117 insertions(+), 8 deletions(-)
Jan Kiszka - Oct. 23, 2012, 12:12 p.m.
On 2012-10-22 11:23, Liu Ping Fan wrote:
> Without biglock, we try to protect the mr by increase refcnt.
> If we can inc refcnt, go backward and resort to biglock.
> 
> Another point is memory radix-tree can be flushed by another
> thread, so we should get the copy of terminal mr to survive
> from such issue.
> 
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> ---
>  exec.c |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 files changed, 117 insertions(+), 8 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index 5834766..91b859b 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -200,6 +200,8 @@ struct PhysPageEntry {
>      uint16_t ptr : 15;
>  };
>  
> +static QemuMutex mem_map_lock;
> +
>  /* Simple allocator for PhysPageEntry nodes */
>  static PhysPageEntry (*phys_map_nodes)[L2_SIZE];
>  static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc;
> @@ -212,6 +214,8 @@ static PhysPageEntry phys_map = { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 0 };
>  
>  static void io_mem_init(void);
>  static void memory_map_init(void);
> +static int phys_page_lookup(target_phys_addr_t addr, MemoryRegionSection *mrs);
> +
>  
>  static MemoryRegion io_mem_watch;
>  #endif
> @@ -2245,6 +2249,7 @@ static void register_subpage(MemoryRegionSection *section)
>      subpage_t *subpage;
>      target_phys_addr_t base = section->offset_within_address_space
>          & TARGET_PAGE_MASK;
> +    /* Already under the protection of mem_map_lock */
>      MemoryRegionSection *existing = phys_page_find(base >> TARGET_PAGE_BITS);
>      MemoryRegionSection subsection = {
>          .offset_within_address_space = base,
> @@ -3165,6 +3170,8 @@ static void io_mem_init(void)
>  
>  static void core_begin(MemoryListener *listener)
>  {
> +    /* protect the updating process of mrs in memory core against readers */
> +    qemu_mutex_lock(&mem_map_lock);
>      destroy_all_mappings();
>      phys_sections_clear();
>      phys_map.ptr = PHYS_MAP_NODE_NIL;
> @@ -3184,17 +3191,32 @@ static void core_commit(MemoryListener *listener)
>      for(env = first_cpu; env != NULL; env = env->next_cpu) {
>          tlb_flush(env, 1);
>      }
> +    qemu_mutex_unlock(&mem_map_lock);
>  }
>  
>  static void core_region_add(MemoryListener *listener,
>                              MemoryRegionSection *section)
>  {
> +    MemoryRegion *mr = section->mr;
> +
> +    if (mr->ops) {
> +        if (mr->ops->ref) {

if (mr->ops && mr->ops->ref) {

here and in the cases below. Unless we avoid that callback anyway,
turning it into an mr flag.

> +            mr->ops->ref(mr);
> +        }
> +    }
>      cpu_register_physical_memory_log(section, section->readonly);
>  }
>  
>  static void core_region_del(MemoryListener *listener,
>                              MemoryRegionSection *section)
>  {
> +    MemoryRegion *mr = section->mr;
> +
> +    if (mr->ops) {
> +        if (mr->ops->unref) {
> +            mr->ops->unref(mr);
> +        }
> +    }
>  }
>  
>  static void core_region_nop(MemoryListener *listener,
> @@ -3348,6 +3370,8 @@ static void memory_map_init(void)
>      memory_region_init(system_io, "io", 65536);
>      set_system_io_map(system_io);
>  
> +    qemu_mutex_init(&mem_map_lock);
> +
>      memory_listener_register(&core_memory_listener, system_memory);
>      memory_listener_register(&io_memory_listener, system_io);
>  }
> @@ -3406,6 +3430,58 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
>  }
>  
>  #else
> +
> +static MemoryRegionSection *subpage_get_terminal(subpage_t *mmio,
> +    target_phys_addr_t addr)
> +{
> +    MemoryRegionSection *section;
> +    unsigned int idx = SUBPAGE_IDX(addr);
> +
> +    section = &phys_sections[mmio->sub_section[idx]];
> +    return section;
> +}
> +
> +static int memory_region_section_ref(MemoryRegionSection *mrs)
> +{
> +    MemoryRegion *mr;
> +    int ret = 0;
> +
> +    mr = mrs->mr;
> +    if (mr->ops) {
> +        if (mr->ops->ref) {
> +            ret = mr->ops->ref(mr);
> +        }
> +    }
> +    return ret;

The return type should be bool, delivering true if reference was successful.

> +}
> +
> +static void memory_region_section_unref(MemoryRegionSection *mrs)
> +{
> +    MemoryRegion *mr;
> +
> +    mr = mrs->mr;
> +    if (mr->ops) {
> +        if (mr->ops->unref) {
> +            mr->ops->unref(mr);
> +        }
> +    }
> +}
> +
> +static int phys_page_lookup(target_phys_addr_t addr, MemoryRegionSection *mrs)
> +{
> +    MemoryRegionSection *section;
> +    int ret;
> +
> +    section = phys_page_find(addr >> TARGET_PAGE_BITS);
> +    if (section->mr->subpage) {
> +        section = subpage_get_terminal(section->mr->opaque, addr);
> +    }
> +    *mrs = *section;
> +    ret = memory_region_section_ref(mrs);
> +
> +    return ret;
> +}
> +
>  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>                              int len, int is_write)
>  {
> @@ -3413,14 +3489,28 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>      uint8_t *ptr;
>      uint32_t val;
>      target_phys_addr_t page;
> -    MemoryRegionSection *section;
> +    MemoryRegionSection *section, obj_mrs;
> +    int safe_ref;
>  
>      while (len > 0) {
>          page = addr & TARGET_PAGE_MASK;
>          l = (page + TARGET_PAGE_SIZE) - addr;
>          if (l > len)
>              l = len;
> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
> +        qemu_mutex_lock(&mem_map_lock);
> +        safe_ref = phys_page_lookup(page, &obj_mrs);
> +        qemu_mutex_unlock(&mem_map_lock);
> +        if (safe_ref == 0) {
> +            qemu_mutex_lock_iothread();
> +            qemu_mutex_lock(&mem_map_lock);
> +            /* At the 2nd try, mem map can change, so need to judge it again */
> +            safe_ref = phys_page_lookup(page, &obj_mrs);
> +            qemu_mutex_unlock(&mem_map_lock);
> +            if (safe_ref > 0) {
> +                qemu_mutex_unlock_iothread();
> +            }
> +        }
> +        section = &obj_mrs;
>  
>          if (is_write) {
>              if (!memory_region_is_ram(section->mr)) {
> @@ -3491,10 +3581,16 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>                  qemu_put_ram_ptr(ptr);
>              }
>          }
> +
> +        memory_region_section_unref(&obj_mrs);

The mapping cannot change from not-referenced to reference-counted while
we were dispatching? I mean the case where we found not ref callback on
entry and took the big lock, but now there is an unref callback.

>          len -= l;
>          buf += l;
>          addr += l;
> +        if (safe_ref == 0) {
> +            qemu_mutex_unlock_iothread();
> +        }
>      }
> +
>  }
>  
>  /* used for ROM loading : can write in RAM and ROM */
> @@ -3504,14 +3600,18 @@ void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>      int l;
>      uint8_t *ptr;
>      target_phys_addr_t page;
> -    MemoryRegionSection *section;
> +    MemoryRegionSection *section, mr_obj;
>  
>      while (len > 0) {
>          page = addr & TARGET_PAGE_MASK;
>          l = (page + TARGET_PAGE_SIZE) - addr;
>          if (l > len)
>              l = len;
> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
> +
> +        qemu_mutex_lock(&mem_map_lock);
> +        phys_page_lookup(page, &mr_obj);
> +        qemu_mutex_unlock(&mem_map_lock);
> +        section = &mr_obj;

But here we don't care about the return code of phys_page_lookup and all
related topics? Because we assume the BQL is held? Reminds me that we
will need some support for assert(qemu_mutex_is_locked(&lock)).

>  
>          if (!(memory_region_is_ram(section->mr) ||
>                memory_region_is_romd(section->mr))) {
> @@ -3528,6 +3628,7 @@ void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>          len -= l;
>          buf += l;
>          addr += l;
> +        memory_region_section_unref(&mr_obj);
>      }
>  }
>  
> @@ -3592,7 +3693,7 @@ void *cpu_physical_memory_map(target_phys_addr_t addr,
>      target_phys_addr_t todo = 0;
>      int l;
>      target_phys_addr_t page;
> -    MemoryRegionSection *section;
> +    MemoryRegionSection *section, mr_obj;
>      ram_addr_t raddr = RAM_ADDR_MAX;
>      ram_addr_t rlen;
>      void *ret;
> @@ -3602,7 +3703,10 @@ void *cpu_physical_memory_map(target_phys_addr_t addr,
>          l = (page + TARGET_PAGE_SIZE) - addr;
>          if (l > len)
>              l = len;
> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
> +        qemu_mutex_lock(&mem_map_lock);
> +        phys_page_lookup(page, &mr_obj);
> +        qemu_mutex_unlock(&mem_map_lock);
> +        section = &mr_obj;
>  
>          if (!(memory_region_is_ram(section->mr) && !section->readonly)) {
>              if (todo || bounce.buffer) {
> @@ -3616,6 +3720,7 @@ void *cpu_physical_memory_map(target_phys_addr_t addr,
>              }
>  
>              *plen = l;
> +            memory_region_section_unref(&mr_obj);
>              return bounce.buffer;
>          }
>          if (!todo) {
> @@ -3630,6 +3735,7 @@ void *cpu_physical_memory_map(target_phys_addr_t addr,
>      rlen = todo;
>      ret = qemu_ram_ptr_length(raddr, &rlen);
>      *plen = rlen;
> +    memory_region_section_unref(&mr_obj);
>      return ret;
>  }
>  
> @@ -4239,9 +4345,12 @@ bool virtio_is_big_endian(void)
>  #ifndef CONFIG_USER_ONLY
>  bool cpu_physical_memory_is_io(target_phys_addr_t phys_addr)
>  {
> -    MemoryRegionSection *section;
> +    MemoryRegionSection *section, mr_obj;
>  
> -    section = phys_page_find(phys_addr >> TARGET_PAGE_BITS);
> +    qemu_mutex_lock(&mem_map_lock);
> +    phys_page_lookup(phys_addr, &mr_obj);
> +    qemu_mutex_unlock(&mem_map_lock);
> +    section = &mr_obj;

Err, no unref needed here?

>  
>      return !(memory_region_is_ram(section->mr) ||
>               memory_region_is_romd(section->mr));
> 

Jan
Avi Kivity - Oct. 23, 2012, 12:36 p.m.
On 10/23/2012 02:12 PM, Jan Kiszka wrote:
> On 2012-10-22 11:23, Liu Ping Fan wrote:
>> Without biglock, we try to protect the mr by increase refcnt.
>> If we can inc refcnt, go backward and resort to biglock.
>> 
>> Another point is memory radix-tree can be flushed by another
>> thread, so we should get the copy of terminal mr to survive
>> from such issue.
>> 
>> +
>>  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>                              int len, int is_write)
>>  {
>> @@ -3413,14 +3489,28 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>      uint8_t *ptr;
>>      uint32_t val;
>>      target_phys_addr_t page;
>> -    MemoryRegionSection *section;
>> +    MemoryRegionSection *section, obj_mrs;
>> +    int safe_ref;
>>  
>>      while (len > 0) {
>>          page = addr & TARGET_PAGE_MASK;
>>          l = (page + TARGET_PAGE_SIZE) - addr;
>>          if (l > len)
>>              l = len;
>> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
>> +        qemu_mutex_lock(&mem_map_lock);
>> +        safe_ref = phys_page_lookup(page, &obj_mrs);
>> +        qemu_mutex_unlock(&mem_map_lock);
>> +        if (safe_ref == 0) {
>> +            qemu_mutex_lock_iothread();
>> +            qemu_mutex_lock(&mem_map_lock);
>> +            /* At the 2nd try, mem map can change, so need to judge it again */
>> +            safe_ref = phys_page_lookup(page, &obj_mrs);
>> +            qemu_mutex_unlock(&mem_map_lock);
>> +            if (safe_ref > 0) {
>> +                qemu_mutex_unlock_iothread();
>> +            }
>> +        }
>> +        section = &obj_mrs;
>>  
>>          if (is_write) {
>>              if (!memory_region_is_ram(section->mr)) {
>> @@ -3491,10 +3581,16 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>                  qemu_put_ram_ptr(ptr);
>>              }
>>          }
>> +
>> +        memory_region_section_unref(&obj_mrs);
> 
> The mapping cannot change from not-referenced to reference-counted while
> we were dispatching? I mean the case where we found not ref callback on
> entry and took the big lock, but now there is an unref callback.

We drop the big lock in that case, so we end up in the same situation.

> 
>>          len -= l;
>>          buf += l;
>>          addr += l;
>> +        if (safe_ref == 0) {
>> +            qemu_mutex_unlock_iothread();
>> +        }
>>      }
>> +
>>  }
>>  
>>  /* used for ROM loading : can write in RAM and ROM */
>> @@ -3504,14 +3600,18 @@ void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>>      int l;
>>      uint8_t *ptr;
>>      target_phys_addr_t page;
>> -    MemoryRegionSection *section;
>> +    MemoryRegionSection *section, mr_obj;
>>  
>>      while (len > 0) {
>>          page = addr & TARGET_PAGE_MASK;
>>          l = (page + TARGET_PAGE_SIZE) - addr;
>>          if (l > len)
>>              l = len;
>> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
>> +
>> +        qemu_mutex_lock(&mem_map_lock);
>> +        phys_page_lookup(page, &mr_obj);
>> +        qemu_mutex_unlock(&mem_map_lock);
>> +        section = &mr_obj;
> 
> But here we don't care about the return code of phys_page_lookup and all
> related topics? Because we assume the BQL is held? Reminds me that we
> will need some support for assert(qemu_mutex_is_locked(&lock)).

I guess it's better to drop that assumption than to have asymmetric APIs.

>>  
>> @@ -4239,9 +4345,12 @@ bool virtio_is_big_endian(void)
>>  #ifndef CONFIG_USER_ONLY
>>  bool cpu_physical_memory_is_io(target_phys_addr_t phys_addr)
>>  {
>> -    MemoryRegionSection *section;
>> +    MemoryRegionSection *section, mr_obj;
>>  
>> -    section = phys_page_find(phys_addr >> TARGET_PAGE_BITS);
>> +    qemu_mutex_lock(&mem_map_lock);
>> +    phys_page_lookup(phys_addr, &mr_obj);
>> +    qemu_mutex_unlock(&mem_map_lock);
>> +    section = &mr_obj;
> 
> Err, no unref needed here?

Need _ref in the name to remind reviewers that it leaves the refcount
unbalanced.
pingfan liu - Oct. 24, 2012, 6:31 a.m.
On Tue, Oct 23, 2012 at 8:36 PM, Avi Kivity <avi@redhat.com> wrote:
> On 10/23/2012 02:12 PM, Jan Kiszka wrote:
>> On 2012-10-22 11:23, Liu Ping Fan wrote:
>>> Without biglock, we try to protect the mr by increase refcnt.
>>> If we can inc refcnt, go backward and resort to biglock.
>>>
>>> Another point is memory radix-tree can be flushed by another
>>> thread, so we should get the copy of terminal mr to survive
>>> from such issue.
>>>
>>> +
>>>  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>>                              int len, int is_write)
>>>  {
>>> @@ -3413,14 +3489,28 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>>      uint8_t *ptr;
>>>      uint32_t val;
>>>      target_phys_addr_t page;
>>> -    MemoryRegionSection *section;
>>> +    MemoryRegionSection *section, obj_mrs;
>>> +    int safe_ref;
>>>
>>>      while (len > 0) {
>>>          page = addr & TARGET_PAGE_MASK;
>>>          l = (page + TARGET_PAGE_SIZE) - addr;
>>>          if (l > len)
>>>              l = len;
>>> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
>>> +        qemu_mutex_lock(&mem_map_lock);
>>> +        safe_ref = phys_page_lookup(page, &obj_mrs);
>>> +        qemu_mutex_unlock(&mem_map_lock);
>>> +        if (safe_ref == 0) {
>>> +            qemu_mutex_lock_iothread();
>>> +            qemu_mutex_lock(&mem_map_lock);
>>> +            /* At the 2nd try, mem map can change, so need to judge it again */
>>> +            safe_ref = phys_page_lookup(page, &obj_mrs);
>>> +            qemu_mutex_unlock(&mem_map_lock);
>>> +            if (safe_ref > 0) {
>>> +                qemu_mutex_unlock_iothread();
>>> +            }
>>> +        }
>>> +        section = &obj_mrs;
>>>
>>>          if (is_write) {
>>>              if (!memory_region_is_ram(section->mr)) {
>>> @@ -3491,10 +3581,16 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>>                  qemu_put_ram_ptr(ptr);
>>>              }
>>>          }
>>> +
>>> +        memory_region_section_unref(&obj_mrs);
>>
>> The mapping cannot change from not-referenced to reference-counted while
>> we were dispatching? I mean the case where we found not ref callback on
>> entry and took the big lock, but now there is an unref callback.
>
> We drop the big lock in that case, so we end up in the same situation.
>
>>
>>>          len -= l;
>>>          buf += l;
>>>          addr += l;
>>> +        if (safe_ref == 0) {
>>> +            qemu_mutex_unlock_iothread();
>>> +        }
>>>      }
>>> +
>>>  }
>>>
>>>  /* used for ROM loading : can write in RAM and ROM */
>>> @@ -3504,14 +3600,18 @@ void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>>>      int l;
>>>      uint8_t *ptr;
>>>      target_phys_addr_t page;
>>> -    MemoryRegionSection *section;
>>> +    MemoryRegionSection *section, mr_obj;
>>>
>>>      while (len > 0) {
>>>          page = addr & TARGET_PAGE_MASK;
>>>          l = (page + TARGET_PAGE_SIZE) - addr;
>>>          if (l > len)
>>>              l = len;
>>> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
>>> +
>>> +        qemu_mutex_lock(&mem_map_lock);
>>> +        phys_page_lookup(page, &mr_obj);
>>> +        qemu_mutex_unlock(&mem_map_lock);
>>> +        section = &mr_obj;
>>
>> But here we don't care about the return code of phys_page_lookup and all
>> related topics? Because we assume the BQL is held? Reminds me that we
>> will need some support for assert(qemu_mutex_is_locked(&lock)).
>
> I guess it's better to drop that assumption than to have asymmetric APIs.
>
Yes, now the updater of physmap based on mem_map_lock, and the same it
will be for readers.
>>>
>>> @@ -4239,9 +4345,12 @@ bool virtio_is_big_endian(void)
>>>  #ifndef CONFIG_USER_ONLY
>>>  bool cpu_physical_memory_is_io(target_phys_addr_t phys_addr)
>>>  {
>>> -    MemoryRegionSection *section;
>>> +    MemoryRegionSection *section, mr_obj;
>>>
>>> -    section = phys_page_find(phys_addr >> TARGET_PAGE_BITS);
>>> +    qemu_mutex_lock(&mem_map_lock);
>>> +    phys_page_lookup(phys_addr, &mr_obj);
>>> +    qemu_mutex_unlock(&mem_map_lock);
>>> +    section = &mr_obj;
>>
>> Err, no unref needed here?
>
> Need _ref in the name to remind reviewers that it leaves the refcount
> unbalanced.
>
Oh, here is a bug, need unref.  As to unbalanced refcount, it will be
adopted for virtio-blk listener (not implement in this patchset)

Regards,
pingfan
> --
> error compiling committee.c: too many arguments to function
>
pingfan liu - Oct. 24, 2012, 6:56 a.m.
On Wed, Oct 24, 2012 at 2:31 PM, liu ping fan <qemulist@gmail.com> wrote:
> On Tue, Oct 23, 2012 at 8:36 PM, Avi Kivity <avi@redhat.com> wrote:
>> On 10/23/2012 02:12 PM, Jan Kiszka wrote:
>>> On 2012-10-22 11:23, Liu Ping Fan wrote:
>>>> Without biglock, we try to protect the mr by increase refcnt.
>>>> If we can inc refcnt, go backward and resort to biglock.
>>>>
>>>> Another point is memory radix-tree can be flushed by another
>>>> thread, so we should get the copy of terminal mr to survive
>>>> from such issue.
>>>>
>>>> +
>>>>  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>>>                              int len, int is_write)
>>>>  {
>>>> @@ -3413,14 +3489,28 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>>>      uint8_t *ptr;
>>>>      uint32_t val;
>>>>      target_phys_addr_t page;
>>>> -    MemoryRegionSection *section;
>>>> +    MemoryRegionSection *section, obj_mrs;
>>>> +    int safe_ref;
>>>>
>>>>      while (len > 0) {
>>>>          page = addr & TARGET_PAGE_MASK;
>>>>          l = (page + TARGET_PAGE_SIZE) - addr;
>>>>          if (l > len)
>>>>              l = len;
>>>> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
>>>> +        qemu_mutex_lock(&mem_map_lock);
>>>> +        safe_ref = phys_page_lookup(page, &obj_mrs);
>>>> +        qemu_mutex_unlock(&mem_map_lock);
>>>> +        if (safe_ref == 0) {
>>>> +            qemu_mutex_lock_iothread();
>>>> +            qemu_mutex_lock(&mem_map_lock);
>>>> +            /* At the 2nd try, mem map can change, so need to judge it again */
>>>> +            safe_ref = phys_page_lookup(page, &obj_mrs);
>>>> +            qemu_mutex_unlock(&mem_map_lock);
>>>> +            if (safe_ref > 0) {
>>>> +                qemu_mutex_unlock_iothread();
>>>> +            }
>>>> +        }
>>>> +        section = &obj_mrs;
>>>>
>>>>          if (is_write) {
>>>>              if (!memory_region_is_ram(section->mr)) {
>>>> @@ -3491,10 +3581,16 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
>>>>                  qemu_put_ram_ptr(ptr);
>>>>              }
>>>>          }
>>>> +
>>>> +        memory_region_section_unref(&obj_mrs);
>>>
>>> The mapping cannot change from not-referenced to reference-counted while
>>> we were dispatching? I mean the case where we found not ref callback on
>>> entry and took the big lock, but now there is an unref callback.
>>
>> We drop the big lock in that case, so we end up in the same situation.
>>
>>>
>>>>          len -= l;
>>>>          buf += l;
>>>>          addr += l;
>>>> +        if (safe_ref == 0) {
>>>> +            qemu_mutex_unlock_iothread();
>>>> +        }
>>>>      }
>>>> +
>>>>  }
>>>>
>>>>  /* used for ROM loading : can write in RAM and ROM */
>>>> @@ -3504,14 +3600,18 @@ void cpu_physical_memory_write_rom(target_phys_addr_t addr,
>>>>      int l;
>>>>      uint8_t *ptr;
>>>>      target_phys_addr_t page;
>>>> -    MemoryRegionSection *section;
>>>> +    MemoryRegionSection *section, mr_obj;
>>>>
>>>>      while (len > 0) {
>>>>          page = addr & TARGET_PAGE_MASK;
>>>>          l = (page + TARGET_PAGE_SIZE) - addr;
>>>>          if (l > len)
>>>>              l = len;
>>>> -        section = phys_page_find(page >> TARGET_PAGE_BITS);
>>>> +
>>>> +        qemu_mutex_lock(&mem_map_lock);
>>>> +        phys_page_lookup(page, &mr_obj);
>>>> +        qemu_mutex_unlock(&mem_map_lock);
>>>> +        section = &mr_obj;
>>>
>>> But here we don't care about the return code of phys_page_lookup and all
>>> related topics? Because we assume the BQL is held? Reminds me that we
>>> will need some support for assert(qemu_mutex_is_locked(&lock)).
>>
>> I guess it's better to drop that assumption than to have asymmetric APIs.
>>
> Yes, now the updater of physmap based on mem_map_lock, and the same it
> will be for readers.
>>>>
>>>> @@ -4239,9 +4345,12 @@ bool virtio_is_big_endian(void)
>>>>  #ifndef CONFIG_USER_ONLY
>>>>  bool cpu_physical_memory_is_io(target_phys_addr_t phys_addr)
>>>>  {
>>>> -    MemoryRegionSection *section;
>>>> +    MemoryRegionSection *section, mr_obj;
>>>>
>>>> -    section = phys_page_find(phys_addr >> TARGET_PAGE_BITS);
>>>> +    qemu_mutex_lock(&mem_map_lock);
>>>> +    phys_page_lookup(phys_addr, &mr_obj);
>>>> +    qemu_mutex_unlock(&mem_map_lock);
>>>> +    section = &mr_obj;
>>>
>>> Err, no unref needed here?
>>
>> Need _ref in the name to remind reviewers that it leaves the refcount
>> unbalanced.
>>
> Oh, here is a bug, need unref.  As to unbalanced refcount, it will be
> adopted for virtio-blk listener (not implement in this patchset)
>
It is like cpu_physical_memory_map/unmap, the map will hold the
unbalanced ref, and unmap release it.

> Regards,
> pingfan
>> --
>> error compiling committee.c: too many arguments to function
>>
Avi Kivity - Oct. 25, 2012, 8:57 a.m.
On 10/24/2012 08:56 AM, liu ping fan wrote:
>>>
>> Oh, here is a bug, need unref.  As to unbalanced refcount, it will be
>> adopted for virtio-blk listener (not implement in this patchset)
>>
> It is like cpu_physical_memory_map/unmap, the map will hold the
> unbalanced ref, and unmap release it.

Those APIs have symmetric names.  map/unmap, ref/unref, lock/unlock.
But here we have lookup and no unlookup.

Patch

diff --git a/exec.c b/exec.c
index 5834766..91b859b 100644
--- a/exec.c
+++ b/exec.c
@@ -200,6 +200,8 @@  struct PhysPageEntry {
     uint16_t ptr : 15;
 };
 
+static QemuMutex mem_map_lock;
+
 /* Simple allocator for PhysPageEntry nodes */
 static PhysPageEntry (*phys_map_nodes)[L2_SIZE];
 static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc;
@@ -212,6 +214,8 @@  static PhysPageEntry phys_map = { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 0 };
 
 static void io_mem_init(void);
 static void memory_map_init(void);
+static int phys_page_lookup(target_phys_addr_t addr, MemoryRegionSection *mrs);
+
 
 static MemoryRegion io_mem_watch;
 #endif
@@ -2245,6 +2249,7 @@  static void register_subpage(MemoryRegionSection *section)
     subpage_t *subpage;
     target_phys_addr_t base = section->offset_within_address_space
         & TARGET_PAGE_MASK;
+    /* Already under the protection of mem_map_lock */
     MemoryRegionSection *existing = phys_page_find(base >> TARGET_PAGE_BITS);
     MemoryRegionSection subsection = {
         .offset_within_address_space = base,
@@ -3165,6 +3170,8 @@  static void io_mem_init(void)
 
 static void core_begin(MemoryListener *listener)
 {
+    /* protect the updating process of mrs in memory core against readers */
+    qemu_mutex_lock(&mem_map_lock);
     destroy_all_mappings();
     phys_sections_clear();
     phys_map.ptr = PHYS_MAP_NODE_NIL;
@@ -3184,17 +3191,32 @@  static void core_commit(MemoryListener *listener)
     for(env = first_cpu; env != NULL; env = env->next_cpu) {
         tlb_flush(env, 1);
     }
+    qemu_mutex_unlock(&mem_map_lock);
 }
 
 static void core_region_add(MemoryListener *listener,
                             MemoryRegionSection *section)
 {
+    MemoryRegion *mr = section->mr;
+
+    if (mr->ops) {
+        if (mr->ops->ref) {
+            mr->ops->ref(mr);
+        }
+    }
     cpu_register_physical_memory_log(section, section->readonly);
 }
 
 static void core_region_del(MemoryListener *listener,
                             MemoryRegionSection *section)
 {
+    MemoryRegion *mr = section->mr;
+
+    if (mr->ops) {
+        if (mr->ops->unref) {
+            mr->ops->unref(mr);
+        }
+    }
 }
 
 static void core_region_nop(MemoryListener *listener,
@@ -3348,6 +3370,8 @@  static void memory_map_init(void)
     memory_region_init(system_io, "io", 65536);
     set_system_io_map(system_io);
 
+    qemu_mutex_init(&mem_map_lock);
+
     memory_listener_register(&core_memory_listener, system_memory);
     memory_listener_register(&io_memory_listener, system_io);
 }
@@ -3406,6 +3430,58 @@  int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
 }
 
 #else
+
+static MemoryRegionSection *subpage_get_terminal(subpage_t *mmio,
+    target_phys_addr_t addr)
+{
+    MemoryRegionSection *section;
+    unsigned int idx = SUBPAGE_IDX(addr);
+
+    section = &phys_sections[mmio->sub_section[idx]];
+    return section;
+}
+
+static int memory_region_section_ref(MemoryRegionSection *mrs)
+{
+    MemoryRegion *mr;
+    int ret = 0;
+
+    mr = mrs->mr;
+    if (mr->ops) {
+        if (mr->ops->ref) {
+            ret = mr->ops->ref(mr);
+        }
+    }
+    return ret;
+}
+
+static void memory_region_section_unref(MemoryRegionSection *mrs)
+{
+    MemoryRegion *mr;
+
+    mr = mrs->mr;
+    if (mr->ops) {
+        if (mr->ops->unref) {
+            mr->ops->unref(mr);
+        }
+    }
+}
+
+static int phys_page_lookup(target_phys_addr_t addr, MemoryRegionSection *mrs)
+{
+    MemoryRegionSection *section;
+    int ret;
+
+    section = phys_page_find(addr >> TARGET_PAGE_BITS);
+    if (section->mr->subpage) {
+        section = subpage_get_terminal(section->mr->opaque, addr);
+    }
+    *mrs = *section;
+    ret = memory_region_section_ref(mrs);
+
+    return ret;
+}
+
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                             int len, int is_write)
 {
@@ -3413,14 +3489,28 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
     uint8_t *ptr;
     uint32_t val;
     target_phys_addr_t page;
-    MemoryRegionSection *section;
+    MemoryRegionSection *section, obj_mrs;
+    int safe_ref;
 
     while (len > 0) {
         page = addr & TARGET_PAGE_MASK;
         l = (page + TARGET_PAGE_SIZE) - addr;
         if (l > len)
             l = len;
-        section = phys_page_find(page >> TARGET_PAGE_BITS);
+        qemu_mutex_lock(&mem_map_lock);
+        safe_ref = phys_page_lookup(page, &obj_mrs);
+        qemu_mutex_unlock(&mem_map_lock);
+        if (safe_ref == 0) {
+            qemu_mutex_lock_iothread();
+            qemu_mutex_lock(&mem_map_lock);
+            /* At the 2nd try, mem map can change, so need to judge it again */
+            safe_ref = phys_page_lookup(page, &obj_mrs);
+            qemu_mutex_unlock(&mem_map_lock);
+            if (safe_ref > 0) {
+                qemu_mutex_unlock_iothread();
+            }
+        }
+        section = &obj_mrs;
 
         if (is_write) {
             if (!memory_region_is_ram(section->mr)) {
@@ -3491,10 +3581,16 @@  void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                 qemu_put_ram_ptr(ptr);
             }
         }
+
+        memory_region_section_unref(&obj_mrs);
         len -= l;
         buf += l;
         addr += l;
+        if (safe_ref == 0) {
+            qemu_mutex_unlock_iothread();
+        }
     }
+
 }
 
 /* used for ROM loading : can write in RAM and ROM */
@@ -3504,14 +3600,18 @@  void cpu_physical_memory_write_rom(target_phys_addr_t addr,
     int l;
     uint8_t *ptr;
     target_phys_addr_t page;
-    MemoryRegionSection *section;
+    MemoryRegionSection *section, mr_obj;
 
     while (len > 0) {
         page = addr & TARGET_PAGE_MASK;
         l = (page + TARGET_PAGE_SIZE) - addr;
         if (l > len)
             l = len;
-        section = phys_page_find(page >> TARGET_PAGE_BITS);
+
+        qemu_mutex_lock(&mem_map_lock);
+        phys_page_lookup(page, &mr_obj);
+        qemu_mutex_unlock(&mem_map_lock);
+        section = &mr_obj;
 
         if (!(memory_region_is_ram(section->mr) ||
               memory_region_is_romd(section->mr))) {
@@ -3528,6 +3628,7 @@  void cpu_physical_memory_write_rom(target_phys_addr_t addr,
         len -= l;
         buf += l;
         addr += l;
+        memory_region_section_unref(&mr_obj);
     }
 }
 
@@ -3592,7 +3693,7 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
     target_phys_addr_t todo = 0;
     int l;
     target_phys_addr_t page;
-    MemoryRegionSection *section;
+    MemoryRegionSection *section, mr_obj;
     ram_addr_t raddr = RAM_ADDR_MAX;
     ram_addr_t rlen;
     void *ret;
@@ -3602,7 +3703,10 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
         l = (page + TARGET_PAGE_SIZE) - addr;
         if (l > len)
             l = len;
-        section = phys_page_find(page >> TARGET_PAGE_BITS);
+        qemu_mutex_lock(&mem_map_lock);
+        phys_page_lookup(page, &mr_obj);
+        qemu_mutex_unlock(&mem_map_lock);
+        section = &mr_obj;
 
         if (!(memory_region_is_ram(section->mr) && !section->readonly)) {
             if (todo || bounce.buffer) {
@@ -3616,6 +3720,7 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
             }
 
             *plen = l;
+            memory_region_section_unref(&mr_obj);
             return bounce.buffer;
         }
         if (!todo) {
@@ -3630,6 +3735,7 @@  void *cpu_physical_memory_map(target_phys_addr_t addr,
     rlen = todo;
     ret = qemu_ram_ptr_length(raddr, &rlen);
     *plen = rlen;
+    memory_region_section_unref(&mr_obj);
     return ret;
 }
 
@@ -4239,9 +4345,12 @@  bool virtio_is_big_endian(void)
 #ifndef CONFIG_USER_ONLY
 bool cpu_physical_memory_is_io(target_phys_addr_t phys_addr)
 {
-    MemoryRegionSection *section;
+    MemoryRegionSection *section, mr_obj;
 
-    section = phys_page_find(phys_addr >> TARGET_PAGE_BITS);
+    qemu_mutex_lock(&mem_map_lock);
+    phys_page_lookup(phys_addr, &mr_obj);
+    qemu_mutex_unlock(&mem_map_lock);
+    section = &mr_obj;
 
     return !(memory_region_is_ram(section->mr) ||
              memory_region_is_romd(section->mr));