Message ID | 1421428797-23697-3-git-send-email-fred.konrad@greensocs.com |
---|---|
State | New |
Headers | show |
fred.konrad@greensocs.com writes: > From: KONRAD Frederic <fred.konrad@greensocs.com> > > We need a different TranslationBlock list for each core in case of multithread > TCG. > > Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com> > --- > translate-all.c | 40 ++++++++++++++++++++++------------------ > 1 file changed, 22 insertions(+), 18 deletions(-) > > diff --git a/translate-all.c b/translate-all.c > index 8fa4378..0e11c70 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -72,10 +72,11 @@ > #endif > > #define SMC_BITMAP_USE_THRESHOLD 10 > +#define MAX_CPUS 256 Where does this number come from? > typedef struct PageDesc { > /* list of TBs intersecting this ram page */ > - TranslationBlock *first_tb; > + TranslationBlock *first_tb[MAX_CPUS]; Especially given the size of the PageDesc structure this adds a lot of of bulk, mostly unused. Is the access to the TB list via PageDesc that frequent to avoid an additional indirection? > /* in order to optimize self modifying code, we count the number > of lookups we do to a given page to use a bitmap */ > unsigned int code_write_count; > @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p) > /* Set to NULL all the 'first_tb' fields in all PageDescs. 
*/ > static void page_flush_tb_1(int level, void **lp) > { > - int i; > + int i, j; > > if (*lp == NULL) { > return; > @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp) > PageDesc *pd = *lp; > > for (i = 0; i < V_L2_SIZE; ++i) { > - pd[i].first_tb = NULL; > + for (j = 0; j < MAX_CPUS; j++) { > + pd[i].first_tb[j] = NULL; > + } > invalidate_page_bitmap(pd + i); > } > } else { > @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) > /* remove the TB from the page list */ > if (tb->page_addr[0] != page_addr) { > p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS); > - tb_page_remove(&p->first_tb, tb); > + tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb); > invalidate_page_bitmap(p); > } > if (tb->page_addr[1] != -1 && tb->page_addr[1] != page_addr) { > p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS); > - tb_page_remove(&p->first_tb, tb); > + tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb); > invalidate_page_bitmap(p); > } > > @@ -1012,7 +1015,7 @@ static void build_page_bitmap(PageDesc *p) > > p->code_bitmap = g_malloc0(TARGET_PAGE_SIZE / 8); > > - tb = p->first_tb; > + tb = p->first_tb[current_cpu->cpu_index]; > while (tb != NULL) { > n = (uintptr_t)tb & 3; > tb = (TranslationBlock *)((uintptr_t)tb & ~3); > @@ -1138,7 +1141,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, > /* we remove all the TBs in the range [start, end[ */ > /* XXX: see if in some cases it could be faster to invalidate all > the code */ > - tb = p->first_tb; > + tb = p->first_tb[cpu->cpu_index]; > while (tb != NULL) { > n = (uintptr_t)tb & 3; > tb = (TranslationBlock *)((uintptr_t)tb & ~3); > @@ -1196,7 +1199,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, > } > #if !defined(CONFIG_USER_ONLY) > /* if no code remaining, no need to continue to use slow writes */ > - if (!p->first_tb) { > + if (!p->first_tb[cpu->cpu_index]) { > 
invalidate_page_bitmap(p); > if (is_cpu_write_access) { > tlb_unprotect_code_phys(cpu, start, cpu->mem_io_vaddr); > @@ -1224,10 +1227,10 @@ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len) > #if 0 > if (1) { > qemu_log("modifying code at 0x%x size=%d EIP=%x PC=%08x\n", > - cpu_single_env->mem_io_vaddr, len, > - cpu_single_env->eip, > - cpu_single_env->eip + > - (intptr_t)cpu_single_env->segs[R_CS].base); > + current_cpu->mem_io_vaddr, len, > + current_cpu->eip, > + current_cpu->eip + > + (intptr_t)current_cpu->segs[R_CS].base); > } > #endif > p = page_find(start >> TARGET_PAGE_BITS); > @@ -1269,7 +1272,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr, > if (!p) { > return; > } > - tb = p->first_tb; > + tb = p->first_tb[current_cpu->cpu_index]; > #ifdef TARGET_HAS_PRECISE_SMC > if (tb && pc != 0) { > current_tb = tb_find_pc(pc); > @@ -1299,7 +1302,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr, > tb_phys_invalidate(tb, addr); > tb = tb->page_next[n]; > } > - p->first_tb = NULL; > + p->first_tb[current_cpu->cpu_index] = NULL; > #ifdef TARGET_HAS_PRECISE_SMC > if (current_tb_modified) { > /* we generate a block containing just the instruction > @@ -1327,11 +1330,12 @@ static inline void tb_alloc_page(TranslationBlock *tb, > > tb->page_addr[n] = page_addr; > p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1); > - tb->page_next[n] = p->first_tb; > + tb->page_next[n] = p->first_tb[current_cpu->cpu_index]; > #ifndef CONFIG_USER_ONLY > - page_already_protected = p->first_tb != NULL; > + page_already_protected = p->first_tb[current_cpu->cpu_index] != NULL; > #endif > - p->first_tb = (TranslationBlock *)((uintptr_t)tb | n); > + p->first_tb[current_cpu->cpu_index] > + = (TranslationBlock *)((uintptr_t)tb | n); > invalidate_page_bitmap(p); > > #if defined(TARGET_HAS_SMC) || 1 > @@ -1821,7 +1825,7 @@ void page_set_flags(target_ulong start, target_ulong end, int flags) > the code inside. 
*/ > if (!(p->flags & PAGE_WRITE) && > (flags & PAGE_WRITE) && > - p->first_tb) { > + p->first_tb[current_cpu->cpu_index]) { > tb_invalidate_phys_page(addr, 0, NULL, false); > } > p->flags = flags; As the TranslationBlock itself has a linked list for page related blocks: struct TranslationBlock *page_next[2]; could we not just come up with a structure that chains them together here?
On 27/01/2015 15:45, Alex Bennée wrote: > fred.konrad@greensocs.com writes: > >> From: KONRAD Frederic <fred.konrad@greensocs.com> >> >> We need a different TranslationBlock list for each core in case of multithread >> TCG. >> >> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com> >> --- >> translate-all.c | 40 ++++++++++++++++++++++------------------ >> 1 file changed, 22 insertions(+), 18 deletions(-) >> >> diff --git a/translate-all.c b/translate-all.c >> index 8fa4378..0e11c70 100644 >> --- a/translate-all.c >> +++ b/translate-all.c >> @@ -72,10 +72,11 @@ >> #endif >> >> #define SMC_BITMAP_USE_THRESHOLD 10 >> +#define MAX_CPUS 256 > Where does this number come from? > >> typedef struct PageDesc { >> /* list of TBs intersecting this ram page */ >> - TranslationBlock *first_tb; >> + TranslationBlock *first_tb[MAX_CPUS]; > Especially given the size of the PageDesc structure this adds a lot of > of bulk, mostly unused. Is the access to the TB list via PageDesc that > frequent to avoid an additional indirection? > >> /* in order to optimize self modifying code, we count the number >> of lookups we do to a given page to use a bitmap */ >> unsigned int code_write_count; >> @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p) >> /* Set to NULL all the 'first_tb' fields in all PageDescs. 
*/ >> static void page_flush_tb_1(int level, void **lp) >> { >> - int i; >> + int i, j; >> >> if (*lp == NULL) { >> return; >> @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp) >> PageDesc *pd = *lp; >> >> for (i = 0; i < V_L2_SIZE; ++i) { >> - pd[i].first_tb = NULL; >> + for (j = 0; j < MAX_CPUS; j++) { >> + pd[i].first_tb[j] = NULL; >> + } >> invalidate_page_bitmap(pd + i); >> } >> } else { >> @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) >> /* remove the TB from the page list */ >> if (tb->page_addr[0] != page_addr) { >> p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS); >> - tb_page_remove(&p->first_tb, tb); >> + tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb); >> invalidate_page_bitmap(p); >> } >> if (tb->page_addr[1] != -1 && tb->page_addr[1] != page_addr) { >> p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS); >> - tb_page_remove(&p->first_tb, tb); >> + tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb); >> invalidate_page_bitmap(p); >> } >> >> @@ -1012,7 +1015,7 @@ static void build_page_bitmap(PageDesc *p) >> >> p->code_bitmap = g_malloc0(TARGET_PAGE_SIZE / 8); >> >> - tb = p->first_tb; >> + tb = p->first_tb[current_cpu->cpu_index]; >> while (tb != NULL) { >> n = (uintptr_t)tb & 3; >> tb = (TranslationBlock *)((uintptr_t)tb & ~3); >> @@ -1138,7 +1141,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, >> /* we remove all the TBs in the range [start, end[ */ >> /* XXX: see if in some cases it could be faster to invalidate all >> the code */ >> - tb = p->first_tb; >> + tb = p->first_tb[cpu->cpu_index]; >> while (tb != NULL) { >> n = (uintptr_t)tb & 3; >> tb = (TranslationBlock *)((uintptr_t)tb & ~3); >> @@ -1196,7 +1199,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, >> } >> #if !defined(CONFIG_USER_ONLY) >> /* if no code remaining, no need to continue to use slow writes */ >> - if (!p->first_tb) { >> + if 
(!p->first_tb[cpu->cpu_index]) { >> invalidate_page_bitmap(p); >> if (is_cpu_write_access) { >> tlb_unprotect_code_phys(cpu, start, cpu->mem_io_vaddr); >> @@ -1224,10 +1227,10 @@ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len) >> #if 0 >> if (1) { >> qemu_log("modifying code at 0x%x size=%d EIP=%x PC=%08x\n", >> - cpu_single_env->mem_io_vaddr, len, >> - cpu_single_env->eip, >> - cpu_single_env->eip + >> - (intptr_t)cpu_single_env->segs[R_CS].base); >> + current_cpu->mem_io_vaddr, len, >> + current_cpu->eip, >> + current_cpu->eip + >> + (intptr_t)current_cpu->segs[R_CS].base); >> } >> #endif >> p = page_find(start >> TARGET_PAGE_BITS); >> @@ -1269,7 +1272,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr, >> if (!p) { >> return; >> } >> - tb = p->first_tb; >> + tb = p->first_tb[current_cpu->cpu_index]; >> #ifdef TARGET_HAS_PRECISE_SMC >> if (tb && pc != 0) { >> current_tb = tb_find_pc(pc); >> @@ -1299,7 +1302,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr, >> tb_phys_invalidate(tb, addr); >> tb = tb->page_next[n]; >> } >> - p->first_tb = NULL; >> + p->first_tb[current_cpu->cpu_index] = NULL; >> #ifdef TARGET_HAS_PRECISE_SMC >> if (current_tb_modified) { >> /* we generate a block containing just the instruction >> @@ -1327,11 +1330,12 @@ static inline void tb_alloc_page(TranslationBlock *tb, >> >> tb->page_addr[n] = page_addr; >> p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1); >> - tb->page_next[n] = p->first_tb; >> + tb->page_next[n] = p->first_tb[current_cpu->cpu_index]; >> #ifndef CONFIG_USER_ONLY >> - page_already_protected = p->first_tb != NULL; >> + page_already_protected = p->first_tb[current_cpu->cpu_index] != NULL; >> #endif >> - p->first_tb = (TranslationBlock *)((uintptr_t)tb | n); >> + p->first_tb[current_cpu->cpu_index] >> + = (TranslationBlock *)((uintptr_t)tb | n); >> invalidate_page_bitmap(p); >> >> #if defined(TARGET_HAS_SMC) || 1 >> @@ -1821,7 +1825,7 @@ void page_set_flags(target_ulong start, 
target_ulong end, int flags) >> the code inside. */ >> if (!(p->flags & PAGE_WRITE) && >> (flags & PAGE_WRITE) && >> - p->first_tb) { >> + p->first_tb[current_cpu->cpu_index]) { >> tb_invalidate_phys_page(addr, 0, NULL, false); >> } >> p->flags = flags; > As the TranslationBlock itself has a linked list for page related > blocks: > > struct TranslationBlock *page_next[2]; > > could we not just come up with a structure that chains them together > here? > Hi Alex, Thanks for looking at this. We don't know how many time this first_tb is accessed right now.. You suggest to chains tb instead of using an array for this? This make sense but I think it means we will have to protect this by a mutex as well? Thanks, Fred
On 16 January 2015 at 17:19, <fred.konrad@greensocs.com> wrote: > From: KONRAD Frederic <fred.konrad@greensocs.com> > > We need a different TranslationBlock list for each core in case of multithread > TCG. > > Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com> > --- > translate-all.c | 40 ++++++++++++++++++++++------------------ > 1 file changed, 22 insertions(+), 18 deletions(-) > > diff --git a/translate-all.c b/translate-all.c > index 8fa4378..0e11c70 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -72,10 +72,11 @@ > #endif > > #define SMC_BITMAP_USE_THRESHOLD 10 > +#define MAX_CPUS 256 > > typedef struct PageDesc { > /* list of TBs intersecting this ram page */ > - TranslationBlock *first_tb; > + TranslationBlock *first_tb[MAX_CPUS]; Do we really need to know this for every CPU, or just for the one that's using this PageDesc? I am assuming we're going to make the l1_map be per-CPU. > /* in order to optimize self modifying code, we count the number > of lookups we do to a given page to use a bitmap */ > unsigned int code_write_count; > @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p) > /* Set to NULL all the 'first_tb' fields in all PageDescs. */ > static void page_flush_tb_1(int level, void **lp) > { > - int i; > + int i, j; > > if (*lp == NULL) { > return; > @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp) > PageDesc *pd = *lp; > > for (i = 0; i < V_L2_SIZE; ++i) { > - pd[i].first_tb = NULL; > + for (j = 0; j < MAX_CPUS; j++) { > + pd[i].first_tb[j] = NULL; > + } > invalidate_page_bitmap(pd + i); > } > } else { > @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) > /* remove the TB from the page list */ > if (tb->page_addr[0] != page_addr) { > p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS); > - tb_page_remove(&p->first_tb, tb); > + tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb); Anything using current_cpu in this code is hugely suspect. 
For instance cpu_restore_state() takes a CPUState pointer and calls this function -- either it should be acting on just that CPU (which might not be the current one) or on all CPUs. In any case implicitly working on current_cpu here is wrong. Probably we need to look at the public-facing functions here and decide which should have "operate on all CPUs" semantics and which should have "operate on the CPU passed as a parameter" and which "operate on the implicit current CPU". -- PMM
I’ll let Fred answer the other points you make - which might help explain what we’re finding.. But - for this one… The idea for now is to keep things simple and have a thread per CPU and a ‘cache’ per thread. (Later we can look at reducing the caches). What we mean by a ‘cache’ needs to be clearer. I don’t think ‘cache’ is a helpful word, but it’s the one that’s been used elsewhere… anyway Actually - what we (think we) mean is this first_tb list (in each page block). In other words, as things stand, each CPU that is going to use this page is also going to build its own translation block list. We end up with pages x CPU first_tb lists… Does that make sense? Cheers Mark. > On 29 Jan 2015, at 16:24, Peter Maydell <peter.maydell@linaro.org> wrote: > >> + TranslationBlock *first_tb[MAX_CPUS]; > > Do we really need to know this for every CPU, or just for > the one that's using this PageDesc? I am assuming we're going to make > the l1_map be per-CPU. +44 (0)20 7100 3485 x 210 +33 (0)5 33 52 01 77x 210 +33 (0)603762104 mark.burton
On 29/01/2015 16:24, Peter Maydell wrote: > On 16 January 2015 at 17:19, <fred.konrad@greensocs.com> wrote: >> From: KONRAD Frederic <fred.konrad@greensocs.com> >> >> We need a different TranslationBlock list for each core in case of multithread >> TCG. >> >> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com> >> --- >> translate-all.c | 40 ++++++++++++++++++++++------------------ >> 1 file changed, 22 insertions(+), 18 deletions(-) >> >> diff --git a/translate-all.c b/translate-all.c >> index 8fa4378..0e11c70 100644 >> --- a/translate-all.c >> +++ b/translate-all.c >> @@ -72,10 +72,11 @@ >> #endif >> >> #define SMC_BITMAP_USE_THRESHOLD 10 >> +#define MAX_CPUS 256 >> >> typedef struct PageDesc { >> /* list of TBs intersecting this ram page */ >> - TranslationBlock *first_tb; >> + TranslationBlock *first_tb[MAX_CPUS]; > Do we really need to know this for every CPU, or just for > the one that's using this PageDesc? I am assuming we're going to make > the l1_map be per-CPU. Do we have any clue of which cpu is using this PageDesc? We did this like that because it is quite simple. > >> /* in order to optimize self modifying code, we count the number >> of lookups we do to a given page to use a bitmap */ >> unsigned int code_write_count; >> @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p) >> /* Set to NULL all the 'first_tb' fields in all PageDescs. 
*/ >> static void page_flush_tb_1(int level, void **lp) >> { >> - int i; >> + int i, j; >> >> if (*lp == NULL) { >> return; >> @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp) >> PageDesc *pd = *lp; >> >> for (i = 0; i < V_L2_SIZE; ++i) { >> - pd[i].first_tb = NULL; >> + for (j = 0; j < MAX_CPUS; j++) { >> + pd[i].first_tb[j] = NULL; >> + } >> invalidate_page_bitmap(pd + i); >> } >> } else { >> @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) >> /* remove the TB from the page list */ >> if (tb->page_addr[0] != page_addr) { >> p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS); >> - tb_page_remove(&p->first_tb, tb); >> + tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb); > Anything using current_cpu in this code is hugely suspect. > For instance cpu_restore_state() takes a CPUState pointer and > calls this function -- either it should be acting on just that > CPU (which might not be the current one) or on all CPUs. In > any case implicitly working on current_cpu here is wrong. > > Probably we need to look at the public-facing functions here > and decide which should have "operate on all CPUs" semantics > and which should have "operate on the CPU passed as a parameter" > and which "operate on the implicit current CPU". Ok so the idea would be to have eg a cpu mask parameter to know which cpu to invalidate/restore etc etc? Or just pointer and invalidate all if NULL? Thanks, Fred > > -- PMM
On 2 February 2015 at 08:39, Frederic Konrad <fred.konrad@greensocs.com> wrote: > On 29/01/2015 16:24, Peter Maydell wrote: >> >> On 16 January 2015 at 17:19, <fred.konrad@greensocs.com> wrote: >>> >>> From: KONRAD Frederic <fred.konrad@greensocs.com> >>> >>> We need a different TranslationBlock list for each core in case of >>> multithread >>> TCG. >>> >>> Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com> >>> --- >>> translate-all.c | 40 ++++++++++++++++++++++------------------ >>> 1 file changed, 22 insertions(+), 18 deletions(-) >>> >>> diff --git a/translate-all.c b/translate-all.c >>> index 8fa4378..0e11c70 100644 >>> --- a/translate-all.c >>> +++ b/translate-all.c >>> @@ -72,10 +72,11 @@ >>> #endif >>> >>> #define SMC_BITMAP_USE_THRESHOLD 10 >>> +#define MAX_CPUS 256 >>> >>> typedef struct PageDesc { >>> /* list of TBs intersecting this ram page */ >>> - TranslationBlock *first_tb; >>> + TranslationBlock *first_tb[MAX_CPUS]; >> >> Do we really need to know this for every CPU, or just for >> the one that's using this PageDesc? I am assuming we're going to make >> the l1_map be per-CPU. > > > Do we have any clue of which cpu is using this PageDesc? If you're making the l1_map per-CPU then the PageDescs you get to from that l1_map are for that CPU. Basically my (possibly naive) view of the PageDesc struct is that it's just the leaf descriptors for the l1_map, and the l1_map has to be per-CPU (because it's a virtual-address-space indexed data structure). So I think if you have a PageDesc you know already which CPU it relates to, because you got it from that CPU thread's l1_map. -- PMM
On 01/16/2015 09:19 AM, fred.konrad@greensocs.com wrote: > @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp) > PageDesc *pd = *lp; > > for (i = 0; i < V_L2_SIZE; ++i) { > - pd[i].first_tb = NULL; > + for (j = 0; j < MAX_CPUS; j++) { > + pd[i].first_tb[j] = NULL; > + } > invalidate_page_bitmap(pd + i); > } > } else { Surely you've got to do some locking somewhere in order to be able to modify another thread's cpu tb list. I realize that we do have to solve this problem for x86, but for most other targets we ought, in principle, be able to avoid it. Which simply requires that we not treat icache flushes as nops. When the kernel has modified a page, like so, it will also have notified the other cpus, like so: if (smp_call_function(ipi_flush_icache_page, mm, 1)) { We ought to be able to leverage this to avoid some locking at the qemu level. r~
On 03/02/2015 17:17, Richard Henderson wrote: >> > @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp) >> > PageDesc *pd = *lp; >> > >> > for (i = 0; i < V_L2_SIZE; ++i) { >> > - pd[i].first_tb = NULL; >> > + for (j = 0; j < MAX_CPUS; j++) { >> > + pd[i].first_tb[j] = NULL; >> > + } >> > invalidate_page_bitmap(pd + i); >> > } >> > } else { > Surely you've got to do some locking somewhere in order to be able to modify > another thread's cpu tb list. But that's probably not even necessary. page_flush_tb_1 is called from tb_flush, which in turn is only called in very special circumstances. It should be possible to have something like the kernel's "stop_machine" that does the following: 1) schedule a callback on all TCG CPU threads 2) wait for all CPUs to have reached that callback 3) do tb_flush on all CPUs, while it knows they are not holding any lock 4) release all TCG CPU threads With one TCG thread, just use qemu_bh_new (hidden behind a suitable API of course!). Once you have multiple TCG CPU threads, loop on all CPUs with the same run_on_cpu function that KVM uses. Paolo
diff --git a/translate-all.c b/translate-all.c index 8fa4378..0e11c70 100644 --- a/translate-all.c +++ b/translate-all.c @@ -72,10 +72,11 @@ #endif #define SMC_BITMAP_USE_THRESHOLD 10 +#define MAX_CPUS 256 typedef struct PageDesc { /* list of TBs intersecting this ram page */ - TranslationBlock *first_tb; + TranslationBlock *first_tb[MAX_CPUS]; /* in order to optimize self modifying code, we count the number of lookups we do to a given page to use a bitmap */ unsigned int code_write_count; @@ -750,7 +751,7 @@ static inline void invalidate_page_bitmap(PageDesc *p) /* Set to NULL all the 'first_tb' fields in all PageDescs. */ static void page_flush_tb_1(int level, void **lp) { - int i; + int i, j; if (*lp == NULL) { return; @@ -759,7 +760,9 @@ static void page_flush_tb_1(int level, void **lp) PageDesc *pd = *lp; for (i = 0; i < V_L2_SIZE; ++i) { - pd[i].first_tb = NULL; + for (j = 0; j < MAX_CPUS; j++) { + pd[i].first_tb[j] = NULL; + } invalidate_page_bitmap(pd + i); } } else { @@ -937,12 +940,12 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) /* remove the TB from the page list */ if (tb->page_addr[0] != page_addr) { p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS); - tb_page_remove(&p->first_tb, tb); + tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb); invalidate_page_bitmap(p); } if (tb->page_addr[1] != -1 && tb->page_addr[1] != page_addr) { p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS); - tb_page_remove(&p->first_tb, tb); + tb_page_remove(&p->first_tb[current_cpu->cpu_index], tb); invalidate_page_bitmap(p); } @@ -1012,7 +1015,7 @@ static void build_page_bitmap(PageDesc *p) p->code_bitmap = g_malloc0(TARGET_PAGE_SIZE / 8); - tb = p->first_tb; + tb = p->first_tb[current_cpu->cpu_index]; while (tb != NULL) { n = (uintptr_t)tb & 3; tb = (TranslationBlock *)((uintptr_t)tb & ~3); @@ -1138,7 +1141,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, /* we remove all the TBs in the range [start, end[ 
*/ /* XXX: see if in some cases it could be faster to invalidate all the code */ - tb = p->first_tb; + tb = p->first_tb[cpu->cpu_index]; while (tb != NULL) { n = (uintptr_t)tb & 3; tb = (TranslationBlock *)((uintptr_t)tb & ~3); @@ -1196,7 +1199,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end, } #if !defined(CONFIG_USER_ONLY) /* if no code remaining, no need to continue to use slow writes */ - if (!p->first_tb) { + if (!p->first_tb[cpu->cpu_index]) { invalidate_page_bitmap(p); if (is_cpu_write_access) { tlb_unprotect_code_phys(cpu, start, cpu->mem_io_vaddr); @@ -1224,10 +1227,10 @@ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len) #if 0 if (1) { qemu_log("modifying code at 0x%x size=%d EIP=%x PC=%08x\n", - cpu_single_env->mem_io_vaddr, len, - cpu_single_env->eip, - cpu_single_env->eip + - (intptr_t)cpu_single_env->segs[R_CS].base); + current_cpu->mem_io_vaddr, len, + current_cpu->eip, + current_cpu->eip + + (intptr_t)current_cpu->segs[R_CS].base); } #endif p = page_find(start >> TARGET_PAGE_BITS); @@ -1269,7 +1272,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr, if (!p) { return; } - tb = p->first_tb; + tb = p->first_tb[current_cpu->cpu_index]; #ifdef TARGET_HAS_PRECISE_SMC if (tb && pc != 0) { current_tb = tb_find_pc(pc); @@ -1299,7 +1302,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr, tb_phys_invalidate(tb, addr); tb = tb->page_next[n]; } - p->first_tb = NULL; + p->first_tb[current_cpu->cpu_index] = NULL; #ifdef TARGET_HAS_PRECISE_SMC if (current_tb_modified) { /* we generate a block containing just the instruction @@ -1327,11 +1330,12 @@ static inline void tb_alloc_page(TranslationBlock *tb, tb->page_addr[n] = page_addr; p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1); - tb->page_next[n] = p->first_tb; + tb->page_next[n] = p->first_tb[current_cpu->cpu_index]; #ifndef CONFIG_USER_ONLY - page_already_protected = p->first_tb != NULL; + page_already_protected = 
p->first_tb[current_cpu->cpu_index] != NULL; #endif - p->first_tb = (TranslationBlock *)((uintptr_t)tb | n); + p->first_tb[current_cpu->cpu_index] + = (TranslationBlock *)((uintptr_t)tb | n); invalidate_page_bitmap(p); #if defined(TARGET_HAS_SMC) || 1 @@ -1821,7 +1825,7 @@ void page_set_flags(target_ulong start, target_ulong end, int flags) the code inside. */ if (!(p->flags & PAGE_WRITE) && (flags & PAGE_WRITE) && - p->first_tb) { + p->first_tb[current_cpu->cpu_index]) { tb_invalidate_phys_page(addr, 0, NULL, false); } p->flags = flags;