diff mbox

[RFC,02/10] use a different translation block list for each cpu.

Message ID 1421428797-23697-3-git-send-email-fred.konrad@greensocs.com
State New
Headers show

Commit Message

fred.konrad@greensocs.com Jan. 16, 2015, 5:19 p.m. UTC
From: KONRAD Frederic <fred.konrad@greensocs.com>

We need a different TranslationBlock list for each core in case of multithread
TCG.

Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
---
 translate-all.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

Comments

Alex Bennée Jan. 27, 2015, 2:45 p.m. UTC | #1
fred.konrad@greensocs.com writes:

> From: KONRAD Frederic <fred.konrad@greensocs.com>
>
> We need a different TranslationBlock list for each core in case of multithread
> TCG.
>
> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
> ---
>  translate-all.c | 40 ++++++++++++++++++++++------------------
>  1 file changed, 22 insertions(+), 18 deletions(-)
>
> diff --git a/translate-all.c b/translate-all.c
> index 8fa4378..0e11c70 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -72,10 +72,11 @@
>  #endif
>  
>  #define SMC_BITMAP_USE_THRESHOLD 10
> +#define MAX_CPUS 256

Where does this number come from?

>  typedef struct PageDesc {
>      /* list of TBs intersecting this ram page */
> -    TranslationBlock *first_tb;
> +    TranslationBlock *first_tb[MAX_CPUS];

Especially given the size of the PageDesc structure this adds a lot of
of bulk, mostly unused. Is the access to the TB list via PageDesc that
frequent to avoid an additional indirection?

>      /* in order to optimize self modifying code, we count the number
>         of lookups we do to a given page to use a bitmap */
>      unsigned int code_write_count;
> @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p)
>  /* Set to NULL all the 'first_tb' fields in all PageDescs. */
>  static void page_flush_tb_1(int level, void **lp)
>  {
> -    int i;
> +    int i, j;
>  
>      if (*lp == NULL) {
>          return;
> @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp)
>          PageDesc *pd = *lp;
>  
>          for (i = 0; i < V_L2_SIZE; ++i) {
> -            pd[i].first_tb = NULL;
> +            for (j = 0; j < MAX_CPUS; j++) {
> +                pd[i].first_tb[j] = NULL;
> +            }
>              invalidate_page_bitmap(pd + i);
>          }
>      } else {
> @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
>      /* remove the TB from the page list */
>      if (tb->page_addr[0] != page_addr) {
>          p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
> -        tb_page_remove(&p->first_tb, tb);
> +        tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb);
>          invalidate_page_bitmap(p);
>      }
>      if (tb->page_addr[1] != -1 && tb->page_addr[1] != page_addr) {
>          p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
> -        tb_page_remove(&p->first_tb, tb);
> +        tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb);
>          invalidate_page_bitmap(p);
>      }
>  
> @@ -1012,7 +1015,7 @@ static void build_page_bitmap(PageDesc *p)
>  
>      p->code_bitmap = g_malloc0(TARGET_PAGE_SIZE / 8);
>  
> -    tb = p->first_tb;
> +    tb = p->first_tb[current_cpu->cpu_index];
>      while (tb != NULL) {
>          n = (uintptr_t)tb & 3;
>          tb = (TranslationBlock *)((uintptr_t)tb & ~3);
> @@ -1138,7 +1141,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
>      /* we remove all the TBs in the range [start, end[ */
>      /* XXX: see if in some cases it could be faster to invalidate all
>         the code */
> -    tb = p->first_tb;
> +    tb = p->first_tb[cpu->cpu_index];
>      while (tb != NULL) {
>          n = (uintptr_t)tb & 3;
>          tb = (TranslationBlock *)((uintptr_t)tb & ~3);
> @@ -1196,7 +1199,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
>      }
>  #if !defined(CONFIG_USER_ONLY)
>      /* if no code remaining, no need to continue to use slow writes */
> -    if (!p->first_tb) {
> +    if (!p->first_tb[cpu->cpu_index]) {
>          invalidate_page_bitmap(p);
>          if (is_cpu_write_access) {
>              tlb_unprotect_code_phys(cpu, start, cpu->mem_io_vaddr);
> @@ -1224,10 +1227,10 @@ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
>  #if 0
>      if (1) {
>          qemu_log("modifying code at 0x%x size=%d EIP=%x PC=%08x\n",
> -                  cpu_single_env->mem_io_vaddr, len,
> -                  cpu_single_env->eip,
> -                  cpu_single_env->eip +
> -                  (intptr_t)cpu_single_env->segs[R_CS].base);
> +                  current_cpu->mem_io_vaddr, len,
> +                  current_cpu->eip,
> +                  current_cpu->eip +
> +                  (intptr_t)current_cpu->segs[R_CS].base);
>      }
>  #endif
>      p = page_find(start >> TARGET_PAGE_BITS);
> @@ -1269,7 +1272,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr,
>      if (!p) {
>          return;
>      }
> -    tb = p->first_tb;
> +    tb = p->first_tb[current_cpu->cpu_index];
>  #ifdef TARGET_HAS_PRECISE_SMC
>      if (tb && pc != 0) {
>          current_tb = tb_find_pc(pc);
> @@ -1299,7 +1302,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr,
>          tb_phys_invalidate(tb, addr);
>          tb = tb->page_next[n];
>      }
> -    p->first_tb = NULL;
> +    p->first_tb[current_cpu->cpu_index] = NULL;
>  #ifdef TARGET_HAS_PRECISE_SMC
>      if (current_tb_modified) {
>          /* we generate a block containing just the instruction
> @@ -1327,11 +1330,12 @@ static inline void tb_alloc_page(TranslationBlock *tb,
>  
>      tb->page_addr[n] = page_addr;
>      p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1);
> -    tb->page_next[n] = p->first_tb;
> +    tb->page_next[n] = p->first_tb[current_cpu->cpu_index];
>  #ifndef CONFIG_USER_ONLY
> -    page_already_protected = p->first_tb != NULL;
> +    page_already_protected = p->first_tb[current_cpu->cpu_index] != NULL;
>  #endif
> -    p->first_tb = (TranslationBlock *)((uintptr_t)tb | n);
> +    p->first_tb[current_cpu->cpu_index]
> +      = (TranslationBlock *)((uintptr_t)tb | n);
>      invalidate_page_bitmap(p);
>  
>  #if defined(TARGET_HAS_SMC) || 1
> @@ -1821,7 +1825,7 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
>             the code inside.  */
>          if (!(p->flags & PAGE_WRITE) &&
>              (flags & PAGE_WRITE) &&
> -            p->first_tb) {
> +            p->first_tb[current_cpu->cpu_index]) {
>              tb_invalidate_phys_page(addr, 0, NULL, false);
>          }
>          p->flags = flags;

As the TranslationBlock itself has a linked list for page related
blocks:

  struct TranslationBlock *page_next[2];

could we not just come up with a structure that chains them together
here?
fred.konrad@greensocs.com Jan. 27, 2015, 3:16 p.m. UTC | #2
On 27/01/2015 15:45, Alex Bennée wrote:
> fred.konrad@greensocs.com writes:
>
>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>
>> We need a different TranslationBlock list for each core in case of multithread
>> TCG.
>>
>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
>> ---
>>   translate-all.c | 40 ++++++++++++++++++++++------------------
>>   1 file changed, 22 insertions(+), 18 deletions(-)
>>
>> diff --git a/translate-all.c b/translate-all.c
>> index 8fa4378..0e11c70 100644
>> --- a/translate-all.c
>> +++ b/translate-all.c
>> @@ -72,10 +72,11 @@
>>   #endif
>>   
>>   #define SMC_BITMAP_USE_THRESHOLD 10
>> +#define MAX_CPUS 256
> Where does this number come from?
>
>>   typedef struct PageDesc {
>>       /* list of TBs intersecting this ram page */
>> -    TranslationBlock *first_tb;
>> +    TranslationBlock *first_tb[MAX_CPUS];
> Especially given the size of the PageDesc structure this adds a lot of
> of bulk, mostly unused. Is the access to the TB list via PageDesc that
> frequent to avoid an additional indirection?
>
>>       /* in order to optimize self modifying code, we count the number
>>          of lookups we do to a given page to use a bitmap */
>>       unsigned int code_write_count;
>> @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p)
>>   /* Set to NULL all the 'first_tb' fields in all PageDescs. */
>>   static void page_flush_tb_1(int level, void **lp)
>>   {
>> -    int i;
>> +    int i, j;
>>   
>>       if (*lp == NULL) {
>>           return;
>> @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp)
>>           PageDesc *pd = *lp;
>>   
>>           for (i = 0; i < V_L2_SIZE; ++i) {
>> -            pd[i].first_tb = NULL;
>> +            for (j = 0; j < MAX_CPUS; j++) {
>> +                pd[i].first_tb[j] = NULL;
>> +            }
>>               invalidate_page_bitmap(pd + i);
>>           }
>>       } else {
>> @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
>>       /* remove the TB from the page list */
>>       if (tb->page_addr[0] != page_addr) {
>>           p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
>> -        tb_page_remove(&p->first_tb, tb);
>> +        tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb);
>>           invalidate_page_bitmap(p);
>>       }
>>       if (tb->page_addr[1] != -1 && tb->page_addr[1] != page_addr) {
>>           p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
>> -        tb_page_remove(&p->first_tb, tb);
>> +        tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb);
>>           invalidate_page_bitmap(p);
>>       }
>>   
>> @@ -1012,7 +1015,7 @@ static void build_page_bitmap(PageDesc *p)
>>   
>>       p->code_bitmap = g_malloc0(TARGET_PAGE_SIZE / 8);
>>   
>> -    tb = p->first_tb;
>> +    tb = p->first_tb[current_cpu->cpu_index];
>>       while (tb != NULL) {
>>           n = (uintptr_t)tb & 3;
>>           tb = (TranslationBlock *)((uintptr_t)tb & ~3);
>> @@ -1138,7 +1141,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
>>       /* we remove all the TBs in the range [start, end[ */
>>       /* XXX: see if in some cases it could be faster to invalidate all
>>          the code */
>> -    tb = p->first_tb;
>> +    tb = p->first_tb[cpu->cpu_index];
>>       while (tb != NULL) {
>>           n = (uintptr_t)tb & 3;
>>           tb = (TranslationBlock *)((uintptr_t)tb & ~3);
>> @@ -1196,7 +1199,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
>>       }
>>   #if !defined(CONFIG_USER_ONLY)
>>       /* if no code remaining, no need to continue to use slow writes */
>> -    if (!p->first_tb) {
>> +    if (!p->first_tb[cpu->cpu_index]) {
>>           invalidate_page_bitmap(p);
>>           if (is_cpu_write_access) {
>>               tlb_unprotect_code_phys(cpu, start, cpu->mem_io_vaddr);
>> @@ -1224,10 +1227,10 @@ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
>>   #if 0
>>       if (1) {
>>           qemu_log("modifying code at 0x%x size=%d EIP=%x PC=%08x\n",
>> -                  cpu_single_env->mem_io_vaddr, len,
>> -                  cpu_single_env->eip,
>> -                  cpu_single_env->eip +
>> -                  (intptr_t)cpu_single_env->segs[R_CS].base);
>> +                  current_cpu->mem_io_vaddr, len,
>> +                  current_cpu->eip,
>> +                  current_cpu->eip +
>> +                  (intptr_t)current_cpu->segs[R_CS].base);
>>       }
>>   #endif
>>       p = page_find(start >> TARGET_PAGE_BITS);
>> @@ -1269,7 +1272,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr,
>>       if (!p) {
>>           return;
>>       }
>> -    tb = p->first_tb;
>> +    tb = p->first_tb[current_cpu->cpu_index];
>>   #ifdef TARGET_HAS_PRECISE_SMC
>>       if (tb && pc != 0) {
>>           current_tb = tb_find_pc(pc);
>> @@ -1299,7 +1302,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr,
>>           tb_phys_invalidate(tb, addr);
>>           tb = tb->page_next[n];
>>       }
>> -    p->first_tb = NULL;
>> +    p->first_tb[current_cpu->cpu_index] = NULL;
>>   #ifdef TARGET_HAS_PRECISE_SMC
>>       if (current_tb_modified) {
>>           /* we generate a block containing just the instruction
>> @@ -1327,11 +1330,12 @@ static inline void tb_alloc_page(TranslationBlock *tb,
>>   
>>       tb->page_addr[n] = page_addr;
>>       p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1);
>> -    tb->page_next[n] = p->first_tb;
>> +    tb->page_next[n] = p->first_tb[current_cpu->cpu_index];
>>   #ifndef CONFIG_USER_ONLY
>> -    page_already_protected = p->first_tb != NULL;
>> +    page_already_protected = p->first_tb[current_cpu->cpu_index] != NULL;
>>   #endif
>> -    p->first_tb = (TranslationBlock *)((uintptr_t)tb | n);
>> +    p->first_tb[current_cpu->cpu_index]
>> +      = (TranslationBlock *)((uintptr_t)tb | n);
>>       invalidate_page_bitmap(p);
>>   
>>   #if defined(TARGET_HAS_SMC) || 1
>> @@ -1821,7 +1825,7 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
>>              the code inside.  */
>>           if (!(p->flags & PAGE_WRITE) &&
>>               (flags & PAGE_WRITE) &&
>> -            p->first_tb) {
>> +            p->first_tb[current_cpu->cpu_index]) {
>>               tb_invalidate_phys_page(addr, 0, NULL, false);
>>           }
>>           p->flags = flags;
> As the TranslationBlock itself has a linked list for page related
> blocks:
>
>    struct TranslationBlock *page_next[2];
>
> could we not just come up with a structure that chains them together
> here?
>
Hi Alex,

Thanks for looking at this.

We don't know how many times this first_tb is accessed right now.
You suggest chaining TBs instead of using an array for this?
This makes sense, but I think it means we will have to protect this by a
mutex as well?

Thanks,
Fred
Peter Maydell Jan. 29, 2015, 3:24 p.m. UTC | #3
On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
> From: KONRAD Frederic <fred.konrad@greensocs.com>
>
> We need a different TranslationBlock list for each core in case of multithread
> TCG.
>
> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
> ---
>  translate-all.c | 40 ++++++++++++++++++++++------------------
>  1 file changed, 22 insertions(+), 18 deletions(-)
>
> diff --git a/translate-all.c b/translate-all.c
> index 8fa4378..0e11c70 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -72,10 +72,11 @@
>  #endif
>
>  #define SMC_BITMAP_USE_THRESHOLD 10
> +#define MAX_CPUS 256
>
>  typedef struct PageDesc {
>      /* list of TBs intersecting this ram page */
> -    TranslationBlock *first_tb;
> +    TranslationBlock *first_tb[MAX_CPUS];

Do we really need to know this for every CPU, or just for
the one that's using this PageDesc? I am assuming we're going to make
the l1_map be per-CPU.

>      /* in order to optimize self modifying code, we count the number
>         of lookups we do to a given page to use a bitmap */
>      unsigned int code_write_count;
> @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p)
>  /* Set to NULL all the 'first_tb' fields in all PageDescs. */
>  static void page_flush_tb_1(int level, void **lp)
>  {
> -    int i;
> +    int i, j;
>
>      if (*lp == NULL) {
>          return;
> @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp)
>          PageDesc *pd = *lp;
>
>          for (i = 0; i < V_L2_SIZE; ++i) {
> -            pd[i].first_tb = NULL;
> +            for (j = 0; j < MAX_CPUS; j++) {
> +                pd[i].first_tb[j] = NULL;
> +            }
>              invalidate_page_bitmap(pd + i);
>          }
>      } else {
> @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
>      /* remove the TB from the page list */
>      if (tb->page_addr[0] != page_addr) {
>          p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
> -        tb_page_remove(&p->first_tb, tb);
> +        tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb);

Anything using current_cpu in this code is hugely suspect.
For instance cpu_restore_state() takes a CPUState pointer and
calls this function -- either it should be acting on just that
CPU (which might not be the current one) or on all CPUs. In
any case implicitly working on current_cpu here is wrong.

Probably we need to look at the public-facing functions here
and decide which should have "operate on all CPUs" semantics
and which should have "operate on the CPU passed as a parameter"
and which "operate on the implicit current CPU".

-- PMM
Mark Burton Jan. 29, 2015, 3:33 p.m. UTC | #4
I’ll let Fred answer the other points you make - which might help explain what we’re finding..
But - for this one…

The idea for now is to keep things simple and have a thread per CPU and a ‘cache’ per thread. (Later we can look at reducing the caches).

What we mean by a ‘cache’ needs to be clearer. I don’t think ‘cache’ is a helpful word, but it’s the one that’s been used elsewhere… Anyway, what we (think we) mean is this first_tb list (in each page block). In other words, as things stand, each CPU that is going to use this page is also going to build its own translation block list. We end up with pages × CPUs first_tb lists…

Does that make sense?

Cheers

Mark.







> On 29 Jan 2015, at 16:24, Peter Maydell <peter.maydell@linaro.org> wrote:
> 
>> +    TranslationBlock *first_tb[MAX_CPUS];
> 
> Do we really need to know this for every CPU, or just for
> the one that's using this PageDesc? I am assuming we're going to make
> the l1_map be per-CPU.


	 +44 (0)20 7100 3485 x 210
 +33 (0)5 33 52 01 77x 210

	+33 (0)603762104
	mark.burton
fred.konrad@greensocs.com Feb. 2, 2015, 8:39 a.m. UTC | #5
On 29/01/2015 16:24, Peter Maydell wrote:
> On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>
>> We need a different TranslationBlock list for each core in case of multithread
>> TCG.
>>
>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
>> ---
>>   translate-all.c | 40 ++++++++++++++++++++++------------------
>>   1 file changed, 22 insertions(+), 18 deletions(-)
>>
>> diff --git a/translate-all.c b/translate-all.c
>> index 8fa4378..0e11c70 100644
>> --- a/translate-all.c
>> +++ b/translate-all.c
>> @@ -72,10 +72,11 @@
>>   #endif
>>
>>   #define SMC_BITMAP_USE_THRESHOLD 10
>> +#define MAX_CPUS 256
>>
>>   typedef struct PageDesc {
>>       /* list of TBs intersecting this ram page */
>> -    TranslationBlock *first_tb;
>> +    TranslationBlock *first_tb[MAX_CPUS];
> Do we really need to know this for every CPU, or just for
> the one that's using this PageDesc? I am assuming we're going to make
> the l1_map be per-CPU.

Do we have any clue about which CPU is using this PageDesc?
We did it this way because it is quite simple.
>
>>       /* in order to optimize self modifying code, we count the number
>>          of lookups we do to a given page to use a bitmap */
>>       unsigned int code_write_count;
>> @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p)
>>   /* Set to NULL all the 'first_tb' fields in all PageDescs. */
>>   static void page_flush_tb_1(int level, void **lp)
>>   {
>> -    int i;
>> +    int i, j;
>>
>>       if (*lp == NULL) {
>>           return;
>> @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp)
>>           PageDesc *pd = *lp;
>>
>>           for (i = 0; i < V_L2_SIZE; ++i) {
>> -            pd[i].first_tb = NULL;
>> +            for (j = 0; j < MAX_CPUS; j++) {
>> +                pd[i].first_tb[j] = NULL;
>> +            }
>>               invalidate_page_bitmap(pd + i);
>>           }
>>       } else {
>> @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
>>       /* remove the TB from the page list */
>>       if (tb->page_addr[0] != page_addr) {
>>           p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
>> -        tb_page_remove(&p->first_tb, tb);
>> +        tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb);
> Anything using current_cpu in this code is hugely suspect.
> For instance cpu_restore_state() takes a CPUState pointer and
> calls this function -- either it should be acting on just that
> CPU (which might not be the current one) or on all CPUs. In
> any case implicitly working on current_cpu here is wrong.
>
> Probably we need to look at the public-facing functions here
> and decide which should have "operate on all CPUs" semantics
> and which should have "operate on the CPU passed as a parameter"
> and which "operate on the implicit current CPU".

OK, so the idea would be to have e.g. a CPU mask parameter to know which CPUs to
invalidate/restore, etc.?
Or just a pointer, and invalidate all if it is NULL?

Thanks,
Fred
>
> -- PMM
Peter Maydell Feb. 2, 2015, 8:49 a.m. UTC | #6
On 2 February 2015 at 08:39, Frederic Konrad <fred.konrad@greensocs.com> wrote:
> On 29/01/2015 16:24, Peter Maydell wrote:
>>
>> On 16 January 2015 at 17:19,  <fred.konrad@greensocs.com> wrote:
>>>
>>> From: KONRAD Frederic <fred.konrad@greensocs.com>
>>>
>>> We need a different TranslationBlock list for each core in case of
>>> multithread
>>> TCG.
>>>
>>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
>>> ---
>>>   translate-all.c | 40 ++++++++++++++++++++++------------------
>>>   1 file changed, 22 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/translate-all.c b/translate-all.c
>>> index 8fa4378..0e11c70 100644
>>> --- a/translate-all.c
>>> +++ b/translate-all.c
>>> @@ -72,10 +72,11 @@
>>>   #endif
>>>
>>>   #define SMC_BITMAP_USE_THRESHOLD 10
>>> +#define MAX_CPUS 256
>>>
>>>   typedef struct PageDesc {
>>>       /* list of TBs intersecting this ram page */
>>> -    TranslationBlock *first_tb;
>>> +    TranslationBlock *first_tb[MAX_CPUS];
>>
>> Do we really need to know this for every CPU, or just for
>> the one that's using this PageDesc? I am assuming we're going to make
>> the l1_map be per-CPU.
>
>
> Do we have any clue of which cpu is using this PageDesc?

If you're making the l1_map per-CPU then the PageDescs you
get to from that l1_map are for that CPU. Basically my
(possibly naive) view of the PageDesc struct is that it's
just the leaf descriptors for the l1_map, and the l1_map
has to be per-CPU (because it's a virtual-address-space
indexed data structure). So I think if you have a PageDesc
you know already which CPU it relates to, because you got
it from that CPU thread's l1_map.

-- PMM
Richard Henderson Feb. 3, 2015, 4:17 p.m. UTC | #7
On 01/16/2015 09:19 AM, fred.konrad@greensocs.com wrote:
> @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp)
>          PageDesc *pd = *lp;
>  
>          for (i = 0; i < V_L2_SIZE; ++i) {
> -            pd[i].first_tb = NULL;
> +            for (j = 0; j < MAX_CPUS; j++) {
> +                pd[i].first_tb[j] = NULL;
> +            }
>              invalidate_page_bitmap(pd + i);
>          }
>      } else {

Surely you've got to do some locking somewhere in order to be able to modify
another thread's cpu tb list.

I realize that we do have to solve this problem for x86, but for most other
targets we ought, in principal, be able to avoid it.  Which simply requires
that we not treat icache flushes as nops.

When the kernel has modified a page like this, it will also have notified the
other cpus, like so:

        if (smp_call_function(ipi_flush_icache_page, mm, 1)) {

We ought to be able to leverage this to avoid some locking at the qemu level.


r~
Paolo Bonzini Feb. 3, 2015, 4:33 p.m. UTC | #8
On 03/02/2015 17:17, Richard Henderson wrote:
>> > @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp)
>> >          PageDesc *pd = *lp;
>> >  
>> >          for (i = 0; i < V_L2_SIZE; ++i) {
>> > -            pd[i].first_tb = NULL;
>> > +            for (j = 0; j < MAX_CPUS; j++) {
>> > +                pd[i].first_tb[j] = NULL;
>> > +            }
>> >              invalidate_page_bitmap(pd + i);
>> >          }
>> >      } else {
> Surely you've got to do some locking somewhere in order to be able to modify
> another thread's cpu tb list.

But that's probably not even necessary.  page_flush_tb_1 is called from
tb_flush, which in turn is only called in very special circumstances.

It should be possible to have something like the kernel's "stop_machine"
that does the following:

1) schedule a callback on all TCG CPU threads

2) wait for all CPUs to have reached that callback

3) do tb_flush on all CPUs, while it knows they are not holding any lock

4) release all TCG CPU threads

With one TCG thread, just use qemu_bh_new (hidden behind a suitable API
of course!).  Once you have multiple TCG CPU threads, loop on all CPUs
with the same run_on_cpu function that KVM uses.

Paolo
diff mbox

Patch

diff --git a/translate-all.c b/translate-all.c
index 8fa4378..0e11c70 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -72,10 +72,11 @@ 
 #endif
 
 #define SMC_BITMAP_USE_THRESHOLD 10
+#define MAX_CPUS 256
 
 typedef struct PageDesc {
     /* list of TBs intersecting this ram page */
-    TranslationBlock *first_tb;
+    TranslationBlock *first_tb[MAX_CPUS];
     /* in order to optimize self modifying code, we count the number
        of lookups we do to a given page to use a bitmap */
     unsigned int code_write_count;
@@ -750,7 +751,7 @@  static inline void invalidate_page_bitmap(PageDesc *p)
 /* Set to NULL all the 'first_tb' fields in all PageDescs. */
 static void page_flush_tb_1(int level, void **lp)
 {
-    int i;
+    int i, j;
 
     if (*lp == NULL) {
         return;
@@ -759,7 +760,9 @@  static void page_flush_tb_1(int level, void **lp)
         PageDesc *pd = *lp;
 
         for (i = 0; i < V_L2_SIZE; ++i) {
-            pd[i].first_tb = NULL;
+            for (j = 0; j < MAX_CPUS; j++) {
+                pd[i].first_tb[j] = NULL;
+            }
             invalidate_page_bitmap(pd + i);
         }
     } else {
@@ -937,12 +940,12 @@  void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
     /* remove the TB from the page list */
     if (tb->page_addr[0] != page_addr) {
         p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
-        tb_page_remove(&p->first_tb, tb);
+        tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb);
         invalidate_page_bitmap(p);
     }
     if (tb->page_addr[1] != -1 && tb->page_addr[1] != page_addr) {
         p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
-        tb_page_remove(&p->first_tb, tb);
+        tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb);
         invalidate_page_bitmap(p);
     }
 
@@ -1012,7 +1015,7 @@  static void build_page_bitmap(PageDesc *p)
 
     p->code_bitmap = g_malloc0(TARGET_PAGE_SIZE / 8);
 
-    tb = p->first_tb;
+    tb = p->first_tb[current_cpu->cpu_index];
     while (tb != NULL) {
         n = (uintptr_t)tb & 3;
         tb = (TranslationBlock *)((uintptr_t)tb & ~3);
@@ -1138,7 +1141,7 @@  void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
     /* we remove all the TBs in the range [start, end[ */
     /* XXX: see if in some cases it could be faster to invalidate all
        the code */
-    tb = p->first_tb;
+    tb = p->first_tb[cpu->cpu_index];
     while (tb != NULL) {
         n = (uintptr_t)tb & 3;
         tb = (TranslationBlock *)((uintptr_t)tb & ~3);
@@ -1196,7 +1199,7 @@  void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
     }
 #if !defined(CONFIG_USER_ONLY)
     /* if no code remaining, no need to continue to use slow writes */
-    if (!p->first_tb) {
+    if (!p->first_tb[cpu->cpu_index]) {
         invalidate_page_bitmap(p);
         if (is_cpu_write_access) {
             tlb_unprotect_code_phys(cpu, start, cpu->mem_io_vaddr);
@@ -1224,10 +1227,10 @@  void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
 #if 0
     if (1) {
         qemu_log("modifying code at 0x%x size=%d EIP=%x PC=%08x\n",
-                  cpu_single_env->mem_io_vaddr, len,
-                  cpu_single_env->eip,
-                  cpu_single_env->eip +
-                  (intptr_t)cpu_single_env->segs[R_CS].base);
+                  current_cpu->mem_io_vaddr, len,
+                  current_cpu->eip,
+                  current_cpu->eip +
+                  (intptr_t)current_cpu->segs[R_CS].base);
     }
 #endif
     p = page_find(start >> TARGET_PAGE_BITS);
@@ -1269,7 +1272,7 @@  static void tb_invalidate_phys_page(tb_page_addr_t addr,
     if (!p) {
         return;
     }
-    tb = p->first_tb;
+    tb = p->first_tb[current_cpu->cpu_index];
 #ifdef TARGET_HAS_PRECISE_SMC
     if (tb && pc != 0) {
         current_tb = tb_find_pc(pc);
@@ -1299,7 +1302,7 @@  static void tb_invalidate_phys_page(tb_page_addr_t addr,
         tb_phys_invalidate(tb, addr);
         tb = tb->page_next[n];
     }
-    p->first_tb = NULL;
+    p->first_tb[current_cpu->cpu_index] = NULL;
 #ifdef TARGET_HAS_PRECISE_SMC
     if (current_tb_modified) {
         /* we generate a block containing just the instruction
@@ -1327,11 +1330,12 @@  static inline void tb_alloc_page(TranslationBlock *tb,
 
     tb->page_addr[n] = page_addr;
     p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1);
-    tb->page_next[n] = p->first_tb;
+    tb->page_next[n] = p->first_tb[current_cpu->cpu_index];
 #ifndef CONFIG_USER_ONLY
-    page_already_protected = p->first_tb != NULL;
+    page_already_protected = p->first_tb[current_cpu->cpu_index] != NULL;
 #endif
-    p->first_tb = (TranslationBlock *)((uintptr_t)tb | n);
+    p->first_tb[current_cpu->cpu_index]
+      = (TranslationBlock *)((uintptr_t)tb | n);
     invalidate_page_bitmap(p);
 
 #if defined(TARGET_HAS_SMC) || 1
@@ -1821,7 +1825,7 @@  void page_set_flags(target_ulong start, target_ulong end, int flags)
            the code inside.  */
         if (!(p->flags & PAGE_WRITE) &&
             (flags & PAGE_WRITE) &&
-            p->first_tb) {
+            p->first_tb[current_cpu->cpu_index]) {
             tb_invalidate_phys_page(addr, 0, NULL, false);
         }
         p->flags = flags;