Patchwork [07/12] target-arm: fix neon vsri, vshl and vsli ops

login
register
mail settings
Submitter Juha.Riihimaki@nokia.com
Date Oct. 21, 2009, 10:17 a.m.
Message ID <FE83A2F3-69F5-4647-A3FE-CD18F4892693@nokia.com>
Download mbox | patch
Permalink /patch/36519/
State New
Headers show

Comments

Juha.Riihimaki@nokia.com - Oct. 21, 2009, 10:17 a.m.
Shift immediate value is incorrectly overwritten by a temporary  
variable in the processing of NEON vsri, vshl and vsli instructions.

Signed-off-by: Juha Riihimäki <juha.riihimaki@nokia.com>
---
      TCGv tmp3;
@@ -4624,31 +4624,31 @@ static int disas_neon_data_insn(CPUState *  
env, DisasContext *s, uint32_t insn)
                              switch (size) {
                              case 0:
                                  if (op == 4)
-                                    imm = 0xff >> -shift;
+                                    imm2 = 0xff >> -shift;
                                  else
-                                    imm = (uint8_t)(0xff << shift);
-                                imm |= imm << 8;
-                                imm |= imm << 16;
+                                    imm2 = (uint8_t)(0xff << shift);
+                                imm2 |= imm2 << 8;
+                                imm2 |= imm2 << 16;
                                  break;
                              case 1:
                                  if (op == 4)
-                                    imm = 0xffff >> -shift;
+                                    imm2 = 0xffff >> -shift;
                                  else
-                                    imm = (uint16_t)(0xffff << shift);
-                                imm |= imm << 16;
+                                    imm2 = (uint16_t)(0xffff << shift);
+                                imm2 |= imm2 << 16;
                                  break;
                              case 2:
                                  if (op == 4)
-                                    imm = 0xffffffffu >> -shift;
+                                    imm2 = 0xffffffffu >> -shift;
                                  else
-                                    imm = 0xffffffffu << shift;
+                                    imm2 = 0xffffffffu << shift;
                                  break;
                              default:
                                  abort();
                              }
                              tmp2 = neon_load_reg(rd, pass);
-                            tcg_gen_andi_i32(tmp, tmp, imm);
-                            tcg_gen_andi_i32(tmp2, tmp2, ~imm);
+                            tcg_gen_andi_i32(tmp, tmp, imm2);
+                            tcg_gen_andi_i32(tmp2, tmp2, ~imm2);
                              tcg_gen_or_i32(tmp, tmp, tmp2);
                              dead_tmp(tmp2);
                          }
Laurent Desnogues - Oct. 21, 2009, 10:46 a.m.
On Wed, Oct 21, 2009 at 12:17 PM,  <Juha.Riihimaki@nokia.com> wrote:
> Shift immediate value is incorrectly overwritten by a temporary
> variable in the processing of NEON vsri, vshl and vsli instructions.
>
> Signed-off-by: Juha Riihimäki <juha.riihimaki@nokia.com>
> ---
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index 59bf7bc..c92ecc6 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -4094,7 +4094,7 @@ static int disas_neon_data_insn(CPUState * env,
> DisasContext *s, uint32_t insn)
>      int pairwise;
>      int u;
>      int n;
> -    uint32_t imm;
> +    uint32_t imm, imm2;

I would prefer mask to imm2, but that's personal taste :-)

>      TCGv tmp;
>      TCGv tmp2;
>      TCGv tmp3;
> @@ -4624,31 +4624,31 @@ static int disas_neon_data_insn(CPUState *
> env, DisasContext *s, uint32_t insn)
>                              switch (size) {
>                              case 0:
>                                  if (op == 4)
> -                                    imm = 0xff >> -shift;
> +                                    imm2 = 0xff >> -shift;
>                                  else
> -                                    imm = (uint8_t)(0xff << shift);
> -                                imm |= imm << 8;
> -                                imm |= imm << 16;
> +                                    imm2 = (uint8_t)(0xff << shift);
> +                                imm2 |= imm2 << 8;
> +                                imm2 |= imm2 << 16;
>                                  break;
>                              case 1:
>                                  if (op == 4)
> -                                    imm = 0xffff >> -shift;
> +                                    imm2 = 0xffff >> -shift;
>                                  else
> -                                    imm = (uint16_t)(0xffff << shift);
> -                                imm |= imm << 16;
> +                                    imm2 = (uint16_t)(0xffff << shift);
> +                                imm2 |= imm2 << 16;
>                                  break;
>                              case 2:
>                                  if (op == 4)
> -                                    imm = 0xffffffffu >> -shift;
> +                                    imm2 = 0xffffffffu >> -shift;
>                                  else
> -                                    imm = 0xffffffffu << shift;
> +                                    imm2 = 0xffffffffu << shift;
>                                  break;
>                              default:
>                                  abort();
>                              }
>                              tmp2 = neon_load_reg(rd, pass);
> -                            tcg_gen_andi_i32(tmp, tmp, imm);
> -                            tcg_gen_andi_i32(tmp2, tmp2, ~imm);
> +                            tcg_gen_andi_i32(tmp, tmp, imm2);
> +                            tcg_gen_andi_i32(tmp2, tmp2, ~imm2);
>                              tcg_gen_or_i32(tmp, tmp, tmp2);
>                              dead_tmp(tmp2);
>                          }

I mostly agree with this, except there's a mistake already
present in the original code:  for a size of 2 the shift amount
can be 32 (look at VSRI in the ARM Architecture Reference
Manual).  Note in this case, shift will be -32.


Laurent
Juha.Riihimaki@nokia.com - Oct. 22, 2009, 6:49 a.m.
On Oct 21, 2009, at 13:46, ext Laurent Desnogues wrote:

>> @@ -4624,31 +4624,31 @@ static int disas_neon_data_insn(CPUState *
>> env, DisasContext *s, uint32_t insn)
>>                              switch (size) {
>>                              case 0:
>>                                  if (op == 4)
>> -                                    imm = 0xff >> -shift;
>> +                                    imm2 = 0xff >> -shift;
>>                                  else
>> -                                    imm = (uint8_t)(0xff << shift);
>> -                                imm |= imm << 8;
>> -                                imm |= imm << 16;
>> +                                    imm2 = (uint8_t)(0xff << shift);
>> +                                imm2 |= imm2 << 8;
>> +                                imm2 |= imm2 << 16;
>>                                  break;
>>                              case 1:
>>                                  if (op == 4)
>> -                                    imm = 0xffff >> -shift;
>> +                                    imm2 = 0xffff >> -shift;
>>                                  else
>> -                                    imm = (uint16_t)(0xffff <<  
>> shift);
>> -                                imm |= imm << 16;
>> +                                    imm2 = (uint16_t)(0xffff <<  
>> shift);
>> +                                imm2 |= imm2 << 16;
>>                                  break;
>>                              case 2:
>>                                  if (op == 4)
>> -                                    imm = 0xffffffffu >> -shift;
>> +                                    imm2 = 0xffffffffu >> -shift;
>>                                  else
>> -                                    imm = 0xffffffffu << shift;
>> +                                    imm2 = 0xffffffffu << shift;
>>                                  break;
>>                              default:
>>                                  abort();
>>                              }
>>                              tmp2 = neon_load_reg(rd, pass);
>> -                            tcg_gen_andi_i32(tmp, tmp, imm);
>> -                            tcg_gen_andi_i32(tmp2, tmp2, ~imm);
>> +                            tcg_gen_andi_i32(tmp, tmp, imm2);
>> +                            tcg_gen_andi_i32(tmp2, tmp2, ~imm2);
>>                              tcg_gen_or_i32(tmp, tmp, tmp2);
>>                              dead_tmp(tmp2);
>>                          }
>
> I mostly agree with this, except there's a mistake already
> present in the original code:  for a size of 2 the shift amount
> can be 32 (look at VSRI in the ARM Architecture Reference
> Manual).  Note in this case, shift will be -32.

True. However, I'm not sure if this causes incorrect behavior or not.  
I'm not a NEON expert but how I understand the VSRI instruction is  
that it will shift the element value before it is inserted in the  
destination vector, therefore with element size 2 and shift 32 the  
result would be always zero and I guess that would still apply if the  
shift was -32: does it matter in which direction you shift if you  
anyway shift all the bits out?


Regards,
Juha
Laurent Desnogues - Oct. 22, 2009, 7:18 a.m.
On Thu, Oct 22, 2009 at 8:49 AM,  <Juha.Riihimaki@nokia.com> wrote:
>
> On Oct 21, 2009, at 13:46, ext Laurent Desnogues wrote:
>
>>> @@ -4624,31 +4624,31 @@ static int disas_neon_data_insn(CPUState *
>>> env, DisasContext *s, uint32_t insn)
>>>                              switch (size) {
>>>                              case 0:
>>>                                  if (op == 4)
>>> -                                    imm = 0xff >> -shift;
>>> +                                    imm2 = 0xff >> -shift;
>>>                                  else
>>> -                                    imm = (uint8_t)(0xff << shift);
>>> -                                imm |= imm << 8;
>>> -                                imm |= imm << 16;
>>> +                                    imm2 = (uint8_t)(0xff << shift);
>>> +                                imm2 |= imm2 << 8;
>>> +                                imm2 |= imm2 << 16;
>>>                                  break;
>>>                              case 1:
>>>                                  if (op == 4)
>>> -                                    imm = 0xffff >> -shift;
>>> +                                    imm2 = 0xffff >> -shift;
>>>                                  else
>>> -                                    imm = (uint16_t)(0xffff <<
>>> shift);
>>> -                                imm |= imm << 16;
>>> +                                    imm2 = (uint16_t)(0xffff <<
>>> shift);
>>> +                                imm2 |= imm2 << 16;
>>>                                  break;
>>>                              case 2:
>>>                                  if (op == 4)
>>> -                                    imm = 0xffffffffu >> -shift;
>>> +                                    imm2 = 0xffffffffu >> -shift;
>>>                                  else
>>> -                                    imm = 0xffffffffu << shift;
>>> +                                    imm2 = 0xffffffffu << shift;
>>>                                  break;
>>>                              default:
>>>                                  abort();
>>>                              }
>>>                              tmp2 = neon_load_reg(rd, pass);
>>> -                            tcg_gen_andi_i32(tmp, tmp, imm);
>>> -                            tcg_gen_andi_i32(tmp2, tmp2, ~imm);
>>> +                            tcg_gen_andi_i32(tmp, tmp, imm2);
>>> +                            tcg_gen_andi_i32(tmp2, tmp2, ~imm2);
>>>                              tcg_gen_or_i32(tmp, tmp, tmp2);
>>>                              dead_tmp(tmp2);
>>>                          }
>>
>> I mostly agree with this, except there's a mistake already
>> present in the original code:  for a size of 2 the shift amount
>> can be 32 (look at VSRI in the ARM Architecture Reference
>> Manual).  Note in this case, shift will be -32.
>
> True. However, I'm not sure if this causes incorrect behavior or not.
> I'm not a NEON expert but how I understand the VSRI instruction is
> that it will shift the element value before it is inserted in the
> destination vector, therefore with element size 2 and shift 32 the
> result would be always zero and I guess that would still apply if the
> shift was -32: does it matter in which direction you shift if you
> anyway shift all the bits out?

The problem is not in the NEON semantics but rather the C
one:  the behaviour of a shift by an amount greater than or
equal to the bit-width of the type is undefined;  in this case
imm2 is 32-bit and the shift is 32-bit.  What you want is
imm2 = 0 if shift is -32, as you correctly guessed.


Laurent
Juha.Riihimaki@nokia.com - Oct. 22, 2009, 7:33 a.m.
On Oct 22, 2009, at 10:18, ext Laurent Desnogues wrote:

> On Thu, Oct 22, 2009 at 8:49 AM,  <Juha.Riihimaki@nokia.com> wrote:
>>
>> On Oct 21, 2009, at 13:46, ext Laurent Desnogues wrote:
>>
>>>> @@ -4624,31 +4624,31 @@ static int disas_neon_data_insn(CPUState *
>>>> env, DisasContext *s, uint32_t insn)
>>>>                              switch (size) {
>>>>                              case 0:
>>>>                                  if (op == 4)
>>>> -                                    imm = 0xff >> -shift;
>>>> +                                    imm2 = 0xff >> -shift;
>>>>                                  else
>>>> -                                    imm = (uint8_t)(0xff <<  
>>>> shift);
>>>> -                                imm |= imm << 8;
>>>> -                                imm |= imm << 16;
>>>> +                                    imm2 = (uint8_t)(0xff <<  
>>>> shift);
>>>> +                                imm2 |= imm2 << 8;
>>>> +                                imm2 |= imm2 << 16;
>>>>                                  break;
>>>>                              case 1:
>>>>                                  if (op == 4)
>>>> -                                    imm = 0xffff >> -shift;
>>>> +                                    imm2 = 0xffff >> -shift;
>>>>                                  else
>>>> -                                    imm = (uint16_t)(0xffff <<
>>>> shift);
>>>> -                                imm |= imm << 16;
>>>> +                                    imm2 = (uint16_t)(0xffff <<
>>>> shift);
>>>> +                                imm2 |= imm2 << 16;
>>>>                                  break;
>>>>                              case 2:
>>>>                                  if (op == 4)
>>>> -                                    imm = 0xffffffffu >> -shift;
>>>> +                                    imm2 = 0xffffffffu >> -shift;
>>>>                                  else
>>>> -                                    imm = 0xffffffffu << shift;
>>>> +                                    imm2 = 0xffffffffu << shift;
>>>>                                  break;
>>>>                              default:
>>>>                                  abort();
>>>>                              }
>>>>                              tmp2 = neon_load_reg(rd, pass);
>>>> -                            tcg_gen_andi_i32(tmp, tmp, imm);
>>>> -                            tcg_gen_andi_i32(tmp2, tmp2, ~imm);
>>>> +                            tcg_gen_andi_i32(tmp, tmp, imm2);
>>>> +                            tcg_gen_andi_i32(tmp2, tmp2, ~imm2);
>>>>                              tcg_gen_or_i32(tmp, tmp, tmp2);
>>>>                              dead_tmp(tmp2);
>>>>                          }
>>>
>>> I mostly agree with this, except there's a mistake already
>>> present in the original code:  for a size of 2 the shift amount
>>> can be 32 (look at VSRI in the ARM Architecture Reference
>>> Manual).  Note in this case, shift will be -32.
>>
>> True. However, I'm not sure if this causes incorrect behavior or not.
>> I'm not a NEON expert but how I understand the VSRI instruction is
>> that it will shift the element value before it is inserted in the
>> destination vector, therefore with element size 2 and shift 32 the
>> result would be always zero and I guess that would still apply if the
>> shift was -32: does it matter in which direction you shift if you
>> anyway shift all the bits out?
>
> The problem is not in the NEON semantics but rather the C
> one:  the behaviour of a shift by an amount greater than or
> equal to the bit-width of the type is undefined;  in this case
> imm2 is 32-bit and the shift is 32-bit.  What you want is
> imm2 = 0 if shift is -32, as you correctly guessed.

Ah, I see. Doesn't the same issue apply for the 8bit and 16bit element  
sizes as well? I suppose it will be sufficient to check for a negative  
shift value and in that case force the mask value to zero.


Regards,
Juha
Laurent Desnogues - Oct. 22, 2009, 7:40 a.m.
On Thu, Oct 22, 2009 at 9:33 AM,  <Juha.Riihimaki@nokia.com> wrote:
>
> On Oct 22, 2009, at 10:18, ext Laurent Desnogues wrote:
>
>> On Thu, Oct 22, 2009 at 8:49 AM,  <Juha.Riihimaki@nokia.com> wrote:
>>>
>>> On Oct 21, 2009, at 13:46, ext Laurent Desnogues wrote:
>>>
>>>>> @@ -4624,31 +4624,31 @@ static int disas_neon_data_insn(CPUState *
>>>>> env, DisasContext *s, uint32_t insn)
>>>>>                              switch (size) {
>>>>>                              case 0:
>>>>>                                  if (op == 4)
>>>>> -                                    imm = 0xff >> -shift;
>>>>> +                                    imm2 = 0xff >> -shift;
>>>>>                                  else
>>>>> -                                    imm = (uint8_t)(0xff <<
>>>>> shift);
>>>>> -                                imm |= imm << 8;
>>>>> -                                imm |= imm << 16;
>>>>> +                                    imm2 = (uint8_t)(0xff <<
>>>>> shift);
>>>>> +                                imm2 |= imm2 << 8;
>>>>> +                                imm2 |= imm2 << 16;
>>>>>                                  break;
>>>>>                              case 1:
>>>>>                                  if (op == 4)
>>>>> -                                    imm = 0xffff >> -shift;
>>>>> +                                    imm2 = 0xffff >> -shift;
>>>>>                                  else
>>>>> -                                    imm = (uint16_t)(0xffff <<
>>>>> shift);
>>>>> -                                imm |= imm << 16;
>>>>> +                                    imm2 = (uint16_t)(0xffff <<
>>>>> shift);
>>>>> +                                imm2 |= imm2 << 16;
>>>>>                                  break;
>>>>>                              case 2:
>>>>>                                  if (op == 4)
>>>>> -                                    imm = 0xffffffffu >> -shift;
>>>>> +                                    imm2 = 0xffffffffu >> -shift;
>>>>>                                  else
>>>>> -                                    imm = 0xffffffffu << shift;
>>>>> +                                    imm2 = 0xffffffffu << shift;
>>>>>                                  break;
>>>>>                              default:
>>>>>                                  abort();
>>>>>                              }
>>>>>                              tmp2 = neon_load_reg(rd, pass);
>>>>> -                            tcg_gen_andi_i32(tmp, tmp, imm);
>>>>> -                            tcg_gen_andi_i32(tmp2, tmp2, ~imm);
>>>>> +                            tcg_gen_andi_i32(tmp, tmp, imm2);
>>>>> +                            tcg_gen_andi_i32(tmp2, tmp2, ~imm2);
>>>>>                              tcg_gen_or_i32(tmp, tmp, tmp2);
>>>>>                              dead_tmp(tmp2);
>>>>>                          }
>>>>
>>>> I mostly agree with this, except there's a mistake already
>>>> present in the original code:  for a size of 2 the shift amount
>>>> can be 32 (look at VSRI in the ARM Architecture Reference
>>>> Manual).  Note in this case, shift will be -32.
>>>
>>> True. However, I'm not sure if this causes incorrect behavior or not.
>>> I'm not a NEON expert but how I understand the VSRI instruction is
>>> that it will shift the element value before it is inserted in the
>>> destination vector, therefore with element size 2 and shift 32 the
>>> result would be always zero and I guess that would still apply if the
>>> shift was -32: does it matter in which direction you shift if you
>>> anyway shift all the bits out?
>>
>> The problem is not in the NEON semantics but rather the C
>> one:  the behaviour of a shift by an amount greater than or
>> equal to the bit-width of the type is undefined;  in this case
>> imm2 is 32-bit and the shift is 32-bit.  What you want is
>> imm2 = 0 if shift is -32, as you correctly guessed.
>
> Ah, I see. Doesn't the same issue apply for the 8bit and 16bit element
> sizes as well? I suppose it will be sufficient to check for a negative
> shift value and in that case force the mask value to zero.

For 8- and 16-bit elements, your shift will always be <= 16 (in
absolute value I mean :-), so you have no issue for these
cases.


Laurent

Patch

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 59bf7bc..c92ecc6 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4094,7 +4094,7 @@  static int disas_neon_data_insn(CPUState * env,  
DisasContext *s, uint32_t insn)
      int pairwise;
      int u;
      int n;
-    uint32_t imm;
+    uint32_t imm, imm2;
      TCGv tmp;
      TCGv tmp2;