Patchwork [v2,09/10] target-arm: optimize neon vld/vst ops

login
register
mail settings
Submitter Juha.Riihimaki@nokia.com
Date Oct. 24, 2009, 12:19 p.m.
Message ID <1256386749-85299-10-git-send-email-juha.riihimaki@nokia.com>
Download mbox | patch
Permalink /patch/36837/
State New
Headers show

Comments

Juha.Riihimaki@nokia.com - Oct. 24, 2009, 12:19 p.m.
From: Juha Riihimäki <juha.riihimaki@nokia.com>

Reduce the amount of TCG ops generated from NEON vld/vst instructions
by simplifying the code generation.

Signed-off-by: Juha Riihimäki <juha.riihimaki@nokia.com>
---
 target-arm/translate.c |   67 ++++++++++++++++++++++++-----------------------
 1 files changed, 34 insertions(+), 33 deletions(-)
Laurent Desnogues - Oct. 25, 2009, 2:01 p.m.
On Sat, Oct 24, 2009 at 1:19 PM,  <juha.riihimaki@nokia.com> wrote:
> From: Juha Riihimäki <juha.riihimaki@nokia.com>
>
> Reduce the amount of TCG ops generated from NEON vld/vst instructions
> by simplifying the code generation.
>
> Signed-off-by: Juha Riihimäki <juha.riihimaki@nokia.com>
> ---
>  target-arm/translate.c |   67 ++++++++++++++++++++++++-----------------------
>  1 files changed, 34 insertions(+), 33 deletions(-)
>
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index f262758..55d6377 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -3708,6 +3708,7 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
>     TCGv tmp;
>     TCGv tmp2;
>     TCGv_i64 tmp64;
> +    TCGv stride_var;
>
>     if (!vfp_enabled(env))
>       return 1;
> @@ -3729,6 +3730,7 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
>             return 1;
>         load_reg_var(s, addr, rn);
>         stride = (1 << size) * interleave;
> +        stride_var = tcg_const_i32(stride);
>         for (reg = 0; reg < nregs; reg++) {
>             if (interleave > 2 || (interleave == 2 && nregs == 2)) {
>                 load_reg_var(s, addr, rn);
> @@ -3747,7 +3749,7 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
>                     neon_load_reg64(tmp64, rd);
>                     gen_st64(tmp64, addr, IS_USER(s));
>                 }
> -                tcg_gen_addi_i32(addr, addr, stride);
> +                tcg_gen_add_i32(addr, addr, stride_var);
>             } else {
>                 for (pass = 0; pass < 2; pass++) {
>                     if (size == 2) {
> @@ -3758,58 +3760,57 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
>                             tmp = neon_load_reg(rd, pass);
>                             gen_st32(tmp, addr, IS_USER(s));
>                         }
> -                        tcg_gen_addi_i32(addr, addr, stride);
> +                        tcg_gen_add_i32(addr, addr, stride_var);
>                     } else if (size == 1) {
>                         if (load) {
>                             tmp = gen_ld16u(addr, IS_USER(s));
> -                            tcg_gen_addi_i32(addr, addr, stride);
> +                            tcg_gen_add_i32(addr, addr, stride_var);
>                             tmp2 = gen_ld16u(addr, IS_USER(s));
> -                            tcg_gen_addi_i32(addr, addr, stride);
> -                            gen_bfi(tmp, tmp, tmp2, 16, 0xffff);
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            tcg_gen_shli_i32(tmp2, tmp2, 16);
> +                            tcg_gen_or_i32(tmp, tmp, tmp2);
>                             dead_tmp(tmp2);
>                             neon_store_reg(rd, pass, tmp);
>                         } else {
>                             tmp = neon_load_reg(rd, pass);
> -                            tmp2 = new_tmp();
> -                            tcg_gen_shri_i32(tmp2, tmp, 16);
> -                            gen_st16(tmp, addr, IS_USER(s));
> -                            tcg_gen_addi_i32(addr, addr, stride);
> -                            gen_st16(tmp2, addr, IS_USER(s));
> -                            tcg_gen_addi_i32(addr, addr, stride);
> +                            tcg_gen_qemu_st16(tmp, addr, IS_USER(s));
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            tcg_gen_shri_i32(tmp, tmp, 16);
> +                            tcg_gen_qemu_st16(tmp, addr, IS_USER(s));
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            dead_tmp(tmp);

I don't really like the idea of having tcg_qemu_ld/st not factored
in some place, as it makes memory access tracing extensions
more intrusive.

This brings us back to the problem having function freeing tmps.
In that case, you could perhaps create a gen_st16_dont_free
function as a temporary workaround?

>                         }
>                     } else /* size == 0 */ {
>                         if (load) {
> -                            TCGV_UNUSED(tmp2);
> -                            for (n = 0; n < 4; n++) {
> -                                tmp = gen_ld8u(addr, IS_USER(s));
> -                                tcg_gen_addi_i32(addr, addr, stride);
> -                                if (n == 0) {
> -                                    tmp2 = tmp;
> -                                } else {
> -                                    gen_bfi(tmp2, tmp2, tmp, n * 8, 0xff);
> -                                    dead_tmp(tmp);
> -                                }
> +                            tmp = gen_ld8u(addr, IS_USER(s));
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            for (n = 1; n < 4; n++) {
> +                                tmp2 = gen_ld8u(addr, IS_USER(s));
> +                                tcg_gen_add_i32(addr, addr, stride_var);
> +                                tcg_gen_shli_i32(tmp2, tmp2, n * 8);
> +                                tcg_gen_or_i32(tmp, tmp, tmp2);
> +                                dead_tmp(tmp2);
>                             }
> -                            neon_store_reg(rd, pass, tmp2);
> +                            neon_store_reg(rd, pass, tmp);
>                         } else {
> -                            tmp2 = neon_load_reg(rd, pass);
> -                            for (n = 0; n < 4; n++) {
> -                                tmp = new_tmp();
> -                                if (n == 0) {
> -                                    tcg_gen_mov_i32(tmp, tmp2);
> -                                } else {
> -                                    tcg_gen_shri_i32(tmp, tmp2, n * 8);
> -                                }
> -                                gen_st8(tmp, addr, IS_USER(s));
> -                                tcg_gen_addi_i32(addr, addr, stride);
> +                            tmp2 = tcg_const_i32(8);
> +                            tmp = neon_load_reg(rd, pass);
> +                            for (n = 0; n < 3; n++) {
> +                                tcg_gen_qemu_st8(tmp, addr, IS_USER(s));
> +                                tcg_gen_add_i32(addr, addr, stride_var);
> +                                tcg_gen_shr_i32(tmp, tmp, tmp2);
>                             }
> -                            dead_tmp(tmp2);
> +                            tcg_gen_qemu_st8(tmp, addr, IS_USER(s));
> +                            tcg_gen_add_i32(addr, addr, stride_var);
> +                            dead_tmp(tmp);
> +                            tcg_temp_free_i32(tmp2);

Same comment as above.


Laurent

>                         }
>                     }
>                 }
>             }
>             rd += spacing;
>         }
> +        tcg_temp_free_i32(stride_var);
>         stride = nregs * 8;
>     } else {
>         size = (insn >> 10) & 3;
> --
> 1.6.5
>
>
>
>
Juha.Riihimaki@nokia.com - Oct. 26, 2009, 7:46 a.m.
On Oct 25, 2009, at 16:01, ext Laurent Desnogues wrote:

> I don't really like the idea of having tcg_qemu_ld/st not factored
> in some place, as it makes memory access tracing extensions
> more intrusive.
>
> This brings us back to the problem having function freeing tmps.
> In that case, you could perhaps create a gen_st16_dont_free
> function as a temporary workaround?

I could, but it is getting ugly =/ How about if I create another patch  
that moves the temporary variable (de)allocation outside the gen_ldx/ 
gen_stx functions and then refactor this patch on top of that?


Regards,
Juha
Laurent Desnogues - Oct. 26, 2009, 9:11 a.m.
On Mon, Oct 26, 2009 at 8:46 AM,  <Juha.Riihimaki@nokia.com> wrote:
>
> On Oct 25, 2009, at 16:01, ext Laurent Desnogues wrote:
>
>> I don't really like the idea of having tcg_qemu_ld/st not factored
>> in some place, as it makes memory access tracing extensions
>> more intrusive.
>>
>> This brings us back to the problem having function freeing tmps.
>> In that case, you could perhaps create a gen_st16_dont_free
>> function as a temporary workaround?
>
> I could, but it is getting ugly =/ How about if I create another patch
> that moves the temporary variable (de)allocation outside the gen_ldx/
> gen_stx functions and then refactor this patch on top of that?

I'd personally like this, but I guess you'd better wait for some inputs
from other QEMU dev's before doing it.


Laurent
Aurelien Jarno - Oct. 26, 2009, 9:05 p.m.
On Mon, Oct 26, 2009 at 10:11:07AM +0100, Laurent Desnogues wrote:
> On Mon, Oct 26, 2009 at 8:46 AM,  <Juha.Riihimaki@nokia.com> wrote:
> >
> > On Oct 25, 2009, at 16:01, ext Laurent Desnogues wrote:
> >
> >> I don't really like the idea of having tcg_qemu_ld/st not factored
> >> in some place, as it makes memory access tracing extensions
> >> more intrusive.
> >>
> >> This brings us back to the problem having function freeing tmps.
> >> In that case, you could perhaps create a gen_st16_dont_free
> >> function as a temporary workaround?
> >
> > I could, but it is getting ugly =/ How about if I create another patch
> > that moves the temporary variable (de)allocation outside the gen_ldx/
> > gen_stx functions and then refactor this patch on top of that?
> 
> I'd personally like this, but I guess you'd better wait for some inputs
> from other QEMU dev's before doing it.

Looks like the best option to me.
Juha.Riihimaki@nokia.com - Oct. 29, 2009, 1:45 p.m.
On Oct 26, 2009, at 23:05, ext Aurelien Jarno wrote:

> On Mon, Oct 26, 2009 at 10:11:07AM +0100, Laurent Desnogues wrote:
>> On Mon, Oct 26, 2009 at 8:46 AM,  <Juha.Riihimaki@nokia.com> wrote:
>>>
>>> On Oct 25, 2009, at 16:01, ext Laurent Desnogues wrote:
>>>
>>>> I don't really like the idea of having tcg_qemu_ld/st not factored
>>>> in some place, as it makes memory access tracing extensions
>>>> more intrusive.
>>>>
>>>> This brings us back to the problem having function freeing tmps.
>>>> In that case, you could perhaps create a gen_st16_dont_free
>>>> function as a temporary workaround?
>>>
>>> I could, but it is getting ugly =/ How about if I create another  
>>> patch
>>> that moves the temporary variable (de)allocation outside the  
>>> gen_ldx/
>>> gen_stx functions and then refactor this patch on top of that?
>>
>> I'd personally like this, but I guess you'd better wait for some  
>> inputs
>> from other QEMU dev's before doing it.
>
> Looks like the best option to me.

Alrighty then, I did the patch against the latest git and it's rather  
large... but seems to have broken nothing at least in my testing. The  
patch will remove all implicit tcg temp variable allocation and  
deallocation in target-arm/translate.c and make it the responsibility  
of the calling function. At the same time I also removed the new_tmp  
and dead_tmp functions completely because I see no point in only  
tracking some of the 32bit temp variables instead of everything.  
Personally I think the patch makes reading and understanding (and why  
not also writing) the file much easier. I do also have a version that  
has a compile time option of substituting the tcg temp variable alloc/ 
dealloc function calls with inline functions that track the usage but  
this is not included with the patch.

I'll send the patch shortly, should be nice bed-time reading I  
guess ;) Please comment when you have free time to read it through...


Cheers,
Juha
Laurent Desnogues - Oct. 29, 2009, 1:52 p.m.
On Thu, Oct 29, 2009 at 2:45 PM,  <Juha.Riihimaki@nokia.com> wrote:
[...]
>
> Alrighty then, I did the patch against the latest git and it's rather
> large... but seems to have broken nothing at least in my testing. The
> patch will remove all implicit tcg temp variable allocation and
> deallocation in target-arm/translate.c and make it the responsibility
> of the calling function. At the same time I also removed the new_tmp
> and dead_tmp functions completely because I see no point in only
> tracking some of the 32bit temp variables instead of everything.
> Personally I think the patch makes reading and understanding (and why
> not also writing) the file much easier. I do also have a version that
> has a compile time option of substituting the tcg temp variable alloc/
> dealloc function calls with inline functions that track the usage but
> this is not included with the patch.

I was thinking about alloc/dealloc problems and I think a good way
of detecting issues is to exercise all target instructions.  It can be
done easily with some bit trickery for targets with regular encodings.
I have already done so for a new target and this helps a lot; it only
required a few dozen lines of code, but it's hacky :-)

> I'll send the patch shortly, should be nice bed-time reading I
> guess ;) Please comment when you have free time to read it through...

Nice!


Laurent

Patch

diff --git a/target-arm/translate.c b/target-arm/translate.c
index f262758..55d6377 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3708,6 +3708,7 @@  static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
     TCGv tmp;
     TCGv tmp2;
     TCGv_i64 tmp64;
+    TCGv stride_var;
 
     if (!vfp_enabled(env))
       return 1;
@@ -3729,6 +3730,7 @@  static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
             return 1;
         load_reg_var(s, addr, rn);
         stride = (1 << size) * interleave;
+        stride_var = tcg_const_i32(stride);
         for (reg = 0; reg < nregs; reg++) {
             if (interleave > 2 || (interleave == 2 && nregs == 2)) {
                 load_reg_var(s, addr, rn);
@@ -3747,7 +3749,7 @@  static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     neon_load_reg64(tmp64, rd);
                     gen_st64(tmp64, addr, IS_USER(s));
                 }
-                tcg_gen_addi_i32(addr, addr, stride);
+                tcg_gen_add_i32(addr, addr, stride_var);
             } else {
                 for (pass = 0; pass < 2; pass++) {
                     if (size == 2) {
@@ -3758,58 +3760,57 @@  static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
                             tmp = neon_load_reg(rd, pass);
                             gen_st32(tmp, addr, IS_USER(s));
                         }
-                        tcg_gen_addi_i32(addr, addr, stride);
+                        tcg_gen_add_i32(addr, addr, stride_var);
                     } else if (size == 1) {
                         if (load) {
                             tmp = gen_ld16u(addr, IS_USER(s));
-                            tcg_gen_addi_i32(addr, addr, stride);
+                            tcg_gen_add_i32(addr, addr, stride_var);
                             tmp2 = gen_ld16u(addr, IS_USER(s));
-                            tcg_gen_addi_i32(addr, addr, stride);
-                            gen_bfi(tmp, tmp, tmp2, 16, 0xffff);
+                            tcg_gen_add_i32(addr, addr, stride_var);
+                            tcg_gen_shli_i32(tmp2, tmp2, 16);
+                            tcg_gen_or_i32(tmp, tmp, tmp2);
                             dead_tmp(tmp2);
                             neon_store_reg(rd, pass, tmp);
                         } else {
                             tmp = neon_load_reg(rd, pass);
-                            tmp2 = new_tmp();
-                            tcg_gen_shri_i32(tmp2, tmp, 16);
-                            gen_st16(tmp, addr, IS_USER(s));
-                            tcg_gen_addi_i32(addr, addr, stride);
-                            gen_st16(tmp2, addr, IS_USER(s));
-                            tcg_gen_addi_i32(addr, addr, stride);
+                            tcg_gen_qemu_st16(tmp, addr, IS_USER(s));
+                            tcg_gen_add_i32(addr, addr, stride_var);
+                            tcg_gen_shri_i32(tmp, tmp, 16);
+                            tcg_gen_qemu_st16(tmp, addr, IS_USER(s));
+                            tcg_gen_add_i32(addr, addr, stride_var);
+                            dead_tmp(tmp);
                         }
                     } else /* size == 0 */ {
                         if (load) {
-                            TCGV_UNUSED(tmp2);
-                            for (n = 0; n < 4; n++) {
-                                tmp = gen_ld8u(addr, IS_USER(s));
-                                tcg_gen_addi_i32(addr, addr, stride);
-                                if (n == 0) {
-                                    tmp2 = tmp;
-                                } else {
-                                    gen_bfi(tmp2, tmp2, tmp, n * 8, 0xff);
-                                    dead_tmp(tmp);
-                                }
+                            tmp = gen_ld8u(addr, IS_USER(s));
+                            tcg_gen_add_i32(addr, addr, stride_var);
+                            for (n = 1; n < 4; n++) {
+                                tmp2 = gen_ld8u(addr, IS_USER(s));
+                                tcg_gen_add_i32(addr, addr, stride_var);
+                                tcg_gen_shli_i32(tmp2, tmp2, n * 8);
+                                tcg_gen_or_i32(tmp, tmp, tmp2);
+                                dead_tmp(tmp2);
                             }
-                            neon_store_reg(rd, pass, tmp2);
+                            neon_store_reg(rd, pass, tmp);
                         } else {
-                            tmp2 = neon_load_reg(rd, pass);
-                            for (n = 0; n < 4; n++) {
-                                tmp = new_tmp();
-                                if (n == 0) {
-                                    tcg_gen_mov_i32(tmp, tmp2);
-                                } else {
-                                    tcg_gen_shri_i32(tmp, tmp2, n * 8);
-                                }
-                                gen_st8(tmp, addr, IS_USER(s));
-                                tcg_gen_addi_i32(addr, addr, stride);
+                            tmp2 = tcg_const_i32(8);
+                            tmp = neon_load_reg(rd, pass);
+                            for (n = 0; n < 3; n++) {
+                                tcg_gen_qemu_st8(tmp, addr, IS_USER(s));
+                                tcg_gen_add_i32(addr, addr, stride_var);
+                                tcg_gen_shr_i32(tmp, tmp, tmp2);
                             }
-                            dead_tmp(tmp2);
+                            tcg_gen_qemu_st8(tmp, addr, IS_USER(s));
+                            tcg_gen_add_i32(addr, addr, stride_var);
+                            dead_tmp(tmp);
+                            tcg_temp_free_i32(tmp2);
                         }
                     }
                 }
             }
             rd += spacing;
         }
+        tcg_temp_free_i32(stride_var);
         stride = nregs * 8;
     } else {
         size = (insn >> 10) & 3;