| Message ID | 1256386749-85299-10-git-send-email-juha.riihimaki@nokia.com |
|---|---|
| State | New |
On Sat, Oct 24, 2009 at 1:19 PM, <juha.riihimaki@nokia.com> wrote:
> From: Juha Riihimäki <juha.riihimaki@nokia.com>
>
> Reduce the amount of TCG ops generated from NEON vld/vst instructions
> by simplifying the code generation.
>
> Signed-off-by: Juha Riihimäki <juha.riihimaki@nokia.com>
> ---
>  target-arm/translate.c |   67 ++++++++++++++++++++++++-----------------------
>  1 files changed, 34 insertions(+), 33 deletions(-)
>
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index f262758..55d6377 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -3708,6 +3708,7 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
>      TCGv tmp;
>      TCGv tmp2;
>      TCGv_i64 tmp64;
> +    TCGv stride_var;
>
>      if (!vfp_enabled(env))
>        return 1;
> @@ -3729,6 +3730,7 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
>              return 1;
>          load_reg_var(s, addr, rn);
>          stride = (1 << size) * interleave;
> +        stride_var = tcg_const_i32(stride);
>          for (reg = 0; reg < nregs; reg++) {
>              if (interleave > 2 || (interleave == 2 && nregs == 2)) {
>                  load_reg_var(s, addr, rn);
> @@ -3747,7 +3749,7 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
>                      neon_load_reg64(tmp64, rd);
>                      gen_st64(tmp64, addr, IS_USER(s));
>                  }
> -                tcg_gen_addi_i32(addr, addr, stride);
> +                tcg_gen_add_i32(addr, addr, stride_var);
>              } else {
>                  for (pass = 0; pass < 2; pass++) {
>                      if (size == 2) {
> @@ -3758,58 +3760,57 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
>                              tmp = neon_load_reg(rd, pass);
>                              gen_st32(tmp, addr, IS_USER(s));
>                          }
> -                        tcg_gen_addi_i32(addr, addr, stride);
> +                        tcg_gen_add_i32(addr, addr, stride_var);
>                      } else if (size == 1) {
>                          if (load) {
>                              tmp = gen_ld16u(addr, IS_USER(s));
> -                            tcg_gen_addi_i32(addr, addr, stride);
> +                            tcg_gen_add_i32(addr, addr, stride_var);
>                              tmp2 = gen_ld16u(addr, IS_USER(s));
> -                            tcg_gen_addi_i32(addr, addr, stride);
> -                            gen_bfi(tmp, tmp, tmp2, 16, 0xffff);
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            tcg_gen_shli_i32(tmp2, tmp2, 16);
> +                            tcg_gen_or_i32(tmp, tmp, tmp2);
>                              dead_tmp(tmp2);
>                              neon_store_reg(rd, pass, tmp);
>                          } else {
>                              tmp = neon_load_reg(rd, pass);
> -                            tmp2 = new_tmp();
> -                            tcg_gen_shri_i32(tmp2, tmp, 16);
> -                            gen_st16(tmp, addr, IS_USER(s));
> -                            tcg_gen_addi_i32(addr, addr, stride);
> -                            gen_st16(tmp2, addr, IS_USER(s));
> -                            tcg_gen_addi_i32(addr, addr, stride);
> +                            tcg_gen_qemu_st16(tmp, addr, IS_USER(s));
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            tcg_gen_shri_i32(tmp, tmp, 16);
> +                            tcg_gen_qemu_st16(tmp, addr, IS_USER(s));
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            dead_tmp(tmp);

I don't really like the idea of having tcg_qemu_ld/st not factored
in some place, as it makes memory access tracing extensions
more intrusive.

This brings us back to the problem having function freeing tmps.
In that case, you could perhaps create a gen_st16_dont_free
function as a temporary workaround?

>                          }
>                      } else /* size == 0 */ {
>                          if (load) {
> -                            TCGV_UNUSED(tmp2);
> -                            for (n = 0; n < 4; n++) {
> -                                tmp = gen_ld8u(addr, IS_USER(s));
> -                                tcg_gen_addi_i32(addr, addr, stride);
> -                                if (n == 0) {
> -                                    tmp2 = tmp;
> -                                } else {
> -                                    gen_bfi(tmp2, tmp2, tmp, n * 8, 0xff);
> -                                    dead_tmp(tmp);
> -                                }
> +                            tmp = gen_ld8u(addr, IS_USER(s));
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            for (n = 1; n < 4; n++) {
> +                                tmp2 = gen_ld8u(addr, IS_USER(s));
> +                                tcg_gen_add_i32(addr, addr, stride_var);
> +                                tcg_gen_shli_i32(tmp2, tmp2, n * 8);
> +                                tcg_gen_or_i32(tmp, tmp, tmp2);
> +                                dead_tmp(tmp2);
>                              }
> -                            neon_store_reg(rd, pass, tmp2);
> +                            neon_store_reg(rd, pass, tmp);
>                          } else {
> -                            tmp2 = neon_load_reg(rd, pass);
> -                            for (n = 0; n < 4; n++) {
> -                                tmp = new_tmp();
> -                                if (n == 0) {
> -                                    tcg_gen_mov_i32(tmp, tmp2);
> -                                } else {
> -                                    tcg_gen_shri_i32(tmp, tmp2, n * 8);
> -                                }
> -                                gen_st8(tmp, addr, IS_USER(s));
> -                                tcg_gen_addi_i32(addr, addr, stride);
> +                            tmp2 = tcg_const_i32(8);
> +                            tmp = neon_load_reg(rd, pass);
> +                            for (n = 0; n < 3; n++) {
> +                                tcg_gen_qemu_st8(tmp, addr, IS_USER(s));
> +                                tcg_gen_add_i32(addr, addr, stride_var);
> +                                tcg_gen_shr_i32(tmp, tmp, tmp2);
>                              }
> -                            dead_tmp(tmp2);
> +                            tcg_gen_qemu_st8(tmp, addr, IS_USER(s));
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            dead_tmp(tmp);
> +                            tcg_temp_free_i32(tmp2);

Same comment as above.


Laurent

>                          }
>                      }
>                  }
>              }
>              rd += spacing;
>          }
> +        tcg_temp_free_i32(stride_var);
>          stride = nregs * 8;
>      } else {
>          size = (insn >> 10) & 3;
> --
> 1.6.5
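For context on the exchange above: the existing gen_st16 helper in target-arm/translate.c both emits the store and frees the value temporary, which is why the patch drops down to raw tcg_gen_qemu_st16 calls when it wants to keep reusing tmp. The gen_st16_dont_free workaround Laurent suggests is only named, not written, in the thread; a minimal sketch (the second helper below is hypothetical, not existing QEMU code) could look like this:

    /* Existing wrapper (roughly): emits the store, then frees the temp. */
    static inline void gen_st16(TCGv val, TCGv addr, int index)
    {
        tcg_gen_qemu_st16(val, addr, index);
        dead_tmp(val);
    }

    /* Hypothetical workaround: same factored store, but the caller keeps
     * ownership of 'val' and calls dead_tmp() itself when done with it. */
    static inline void gen_st16_dont_free(TCGv val, TCGv addr, int index)
    {
        tcg_gen_qemu_st16(val, addr, index);
    }

This would keep every guest memory access routed through one wrapper, so a memory access tracing hook would still have a single place to attach to.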
On Oct 25, 2009, at 16:01, ext Laurent Desnogues wrote:

> I don't really like the idea of having tcg_qemu_ld/st not factored
> in some place, as it makes memory access tracing extensions
> more intrusive.
>
> This brings us back to the problem having function freeing tmps.
> In that case, you could perhaps create a gen_st16_dont_free
> function as a temporary workaround?

I could, but it is getting ugly =/ How about if I create another patch
that moves the temporary variable (de)allocation outside the gen_ldx/
gen_stx functions and then refactor this patch on top of that?


Regards,
Juha
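To make the proposal concrete: with temp ownership moved to the callers, the size == 1 store path in the patch could keep using a factored store helper while still reusing a single temporary. The fragment below is only an illustrative sketch of the intended calling convention (it assumes a gen_st16 variant that no longer frees its argument), not the follow-up patch itself:

    tmp = neon_load_reg(rd, pass);
    gen_st16(tmp, addr, IS_USER(s));          /* store low half; tmp stays alive */
    tcg_gen_add_i32(addr, addr, stride_var);
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_st16(tmp, addr, IS_USER(s));          /* store high half, same temp */
    tcg_gen_add_i32(addr, addr, stride_var);
    dead_tmp(tmp);                            /* caller releases the temp */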
On Mon, Oct 26, 2009 at 8:46 AM, <Juha.Riihimaki@nokia.com> wrote:
>
> On Oct 25, 2009, at 16:01, ext Laurent Desnogues wrote:
>
>> I don't really like the idea of having tcg_qemu_ld/st not factored
>> in some place, as it makes memory access tracing extensions
>> more intrusive.
>>
>> This brings us back to the problem having function freeing tmps.
>> In that case, you could perhaps create a gen_st16_dont_free
>> function as a temporary workaround?
>
> I could, but it is getting ugly =/ How about if I create another patch
> that moves the temporary variable (de)allocation outside the gen_ldx/
> gen_stx functions and then refactor this patch on top of that?

I'd personally like this, but I guess you'd better wait for some inputs
from other QEMU dev's before doing it.


Laurent
On Mon, Oct 26, 2009 at 10:11:07AM +0100, Laurent Desnogues wrote:
> On Mon, Oct 26, 2009 at 8:46 AM, <Juha.Riihimaki@nokia.com> wrote:
> >
> > On Oct 25, 2009, at 16:01, ext Laurent Desnogues wrote:
> >
> >> I don't really like the idea of having tcg_qemu_ld/st not factored
> >> in some place, as it makes memory access tracing extensions
> >> more intrusive.
> >>
> >> This brings us back to the problem having function freeing tmps.
> >> In that case, you could perhaps create a gen_st16_dont_free
> >> function as a temporary workaround?
> >
> > I could, but it is getting ugly =/ How about if I create another patch
> > that moves the temporary variable (de)allocation outside the gen_ldx/
> > gen_stx functions and then refactor this patch on top of that?
>
> I'd personally like this, but I guess you'd better wait for some inputs
> from other QEMU dev's before doing it.

Looks like the best option to me.
On Oct 26, 2009, at 23:05, ext Aurelien Jarno wrote:

> On Mon, Oct 26, 2009 at 10:11:07AM +0100, Laurent Desnogues wrote:
>> On Mon, Oct 26, 2009 at 8:46 AM, <Juha.Riihimaki@nokia.com> wrote:
>>>
>>> On Oct 25, 2009, at 16:01, ext Laurent Desnogues wrote:
>>>
>>>> I don't really like the idea of having tcg_qemu_ld/st not factored
>>>> in some place, as it makes memory access tracing extensions
>>>> more intrusive.
>>>>
>>>> This brings us back to the problem having function freeing tmps.
>>>> In that case, you could perhaps create a gen_st16_dont_free
>>>> function as a temporary workaround?
>>>
>>> I could, but it is getting ugly =/ How about if I create another patch
>>> that moves the temporary variable (de)allocation outside the gen_ldx/
>>> gen_stx functions and then refactor this patch on top of that?
>>
>> I'd personally like this, but I guess you'd better wait for some inputs
>> from other QEMU dev's before doing it.
>
> Looks like the best option to me.

Alrighty then, I did the patch against the latest git and it's rather
large... but seems to have broken nothing at least in my testing. The
patch will remove all implicit tcg temp variable allocation and
deallocation in target-arm/translate.c and make it the responsibility
of the calling function. At the same time I also removed the new_tmp
and dead_tmp functions completely because I see no point in only
tracking some of the 32bit temp variables instead of everything.
Personally I think the patch makes reading and understanding (and why
not also writing) the file much easier. I do also have a version that
has a compile time option of substituting the tcg temp variable alloc/
dealloc function calls with inline functions that track the usage but
this is not included with the patch.

I'll send the patch shortly, should be nice bed-time reading I
guess ;) Please comment when you have free time to read it through...

Cheers,
Juha
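The compile-time tracking option Juha mentions is not included in his patch and is not described in detail; the wrappers below are only a sketch of the general idea, with the config symbol, function names and counter invented for illustration:

    #ifdef CONFIG_TRACK_ARM_TEMPS              /* hypothetical switch, not a real QEMU option */
    static int arm_live_temps;                 /* temps allocated but not yet freed */

    static inline TCGv_i32 arm_tmp_new_i32(void)
    {
        arm_live_temps++;
        return tcg_temp_new_i32();
    }

    static inline void arm_tmp_free_i32(TCGv_i32 var)
    {
        arm_live_temps--;
        tcg_temp_free_i32(var);
    }
    /* A check such as assert(arm_live_temps == 0) at the end of each
     * translation block would then flag leaked temporaries. */
    #else
    #define arm_tmp_new_i32   tcg_temp_new_i32
    #define arm_tmp_free_i32  tcg_temp_free_i32
    #endif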
On Thu, Oct 29, 2009 at 2:45 PM, <Juha.Riihimaki@nokia.com> wrote:
[...]
>
> Alrighty then, I did the patch against the latest git and it's rather
> large... but seems to have broken nothing at least in my testing. The
> patch will remove all implicit tcg temp variable allocation and
> deallocation in target-arm/translate.c and make it the responsibility
> of the calling function. At the same time I also removed the new_tmp
> and dead_tmp functions completely because I see no point in only
> tracking some of the 32bit temp variables instead of everything.
> Personally I think the patch makes reading and understanding (and why
> not also writing) the file much easier. I do also have a version that
> has a compile time option of substituting the tcg temp variable alloc/
> dealloc function calls with inline functions that track the usage but
> this is not included with the patch.

I was thinking about alloc/dealloc problems and I think a good way of
detecting issues is to exercise all target instructions. It can be done
easily with some bit trickery for targets with regular encodings. I
have already done so for a new target and this helps a lot; it only
required a few dozens lines of code, but it's hacky :-)

> I'll send the patch shortly, should be nice bed-time reading I
> guess ;) Please comment when you have free time to read it through...

Nice!


Laurent
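Laurent does not spell out the "bit trickery"; the loop below only illustrates the general shape of such a test for a regular fixed-width encoding. Every name in it (the field positions, translate_insn_for_test and live_temp_count) is hypothetical rather than an existing QEMU interface:

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical hooks into the translator under test. */
    extern void translate_insn_for_test(uint32_t insn);  /* translate one guest insn */
    extern int  live_temp_count(void);                   /* temps still allocated */

    /* Keep the opcode bits of one instruction class fixed and sweep the
     * register/size fields of an (assumed) regular 32-bit encoding. */
    static void exercise_class(uint32_t fixed_bits)
    {
        uint32_t rd, size;
        for (rd = 0; rd < 32; rd++) {
            for (size = 0; size < 4; size++) {
                uint32_t insn = fixed_bits | (rd << 12) | (size << 6);
                translate_insn_for_test(insn);
                assert(live_temp_count() == 0);  /* any leaked temp shows up here */
            }
        }
    }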