diff mbox

[14/60] AArch64: Add orr instruction emulation

Message ID alpine.LNX.2.00.1311181338050.11100@wotan.suse.de
State New
Headers show

Commit Message

Michael Matz Nov. 18, 2013, 1:12 p.m. UTC
Hi,

On Mon, 18 Nov 2013, Claudio Fontana wrote:

> >> +    case 3:
> >> +        tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
> >> +        break;
> > 
> > Incorrect rotate for 32bit?

32bit rotates and shifts were fixed in a patch later than the 60er series 
Alex posted.  See attached.  (Generally there are many fixes to emulated 
instructions in that branch)

> >> +    if (!shift_amount && source == 0x1f) {
> 
> Besides the comment, is this correct?

No, it needs to check for opc == 1.

> >> +    tcg_dest = cpu_reg(dest);
> >> +    switch (opc) {
> >> +    case 0x0:
> >> +    case 0x3:
> >> +        tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2);
> >> +        break;
> >> +    case 0x1:
> >> +        tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2);
> >> +        break;
> >> +    case 0x2:
> >> +        tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2);
> >> +        break;
> >> +    }
> >> +
> >> +    if (is_32bit) {
> >> +        tcg_gen_ext32u_i64(tcg_dest, tcg_dest);
> >> +    }
> >> +
> >> +    if (setflags) {
> >> +        gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), tcg_dest);
> >> +    }
> > 
> > Incorrect flags generated.  They're different between add/sub and logical.
> > In particular, C and V are always zero.

That's done correctly with the fixed pstate helpers coming with a later 
patch (see attached as well).  reg31 is zero, so that's flags as if for 
"dest == dest + 0", and PSTATE_C and PSTATE_V will be zero.  That is, the 
logical flags are the same as the arithmetic flags for result plus zero 
with no carry_in.


Ciao,
Michael.

Comments

Peter Maydell Nov. 18, 2013, 1:15 p.m. UTC | #1
On 18 November 2013 13:12, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Mon, 18 Nov 2013, Claudio Fontana wrote:
>
>> >> +    case 3:
>> >> +        tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
>> >> +        break;
>> >
>> > Incorrect rotate for 32bit?
>
> 32bit rotates and shifts were fixed in a patch later than the 60er series
> Alex posted.  See attached.  (Generally there are many fixes to emulated
> instructions in that branch)

I think we're going to need to look through and fold in those
fixes, otherwise we'll end up reduplicating that work in the
course of code review :-(

-- PMM
Claudio Fontana Nov. 18, 2013, 1:24 p.m. UTC | #2
On 11/18/2013 02:15 PM, Peter Maydell wrote:
> On 18 November 2013 13:12, Michael Matz <matz@suse.de> wrote:
>> Hi,
>>
>> On Mon, 18 Nov 2013, Claudio Fontana wrote:
>>
>>>>> +    case 3:
>>>>> +        tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
>>>>> +        break;
>>>>
>>>> Incorrect rotate for 32bit?
>>
>> 32bit rotates and shifts were fixed in a patch later than the 60er series
>> Alex posted.  See attached.  (Generally there are many fixes to emulated
>> instructions in that branch)
> 
> I think we're going to need to look through and fold in those
> fixes, otherwise we'll end up reduplicating that work in the
> course of code review :-(
> 
> -- PMM
> 

Thanks all.

Regarding the access to registers in 32 bit mode, and the consequent write to registers in 32 bit mode,
I am investigating how to do it a little bit more general, in the sense that generally when we access registers in 32bit mode
we will (often) need to ignore the upper bits of the source register, and write zero to the destination register.
Not always but often. This could be done once for all to reduce the chances of mistakes.

C.
Claudio Fontana Nov. 18, 2013, 1:43 p.m. UTC | #3
Btw, in the first patch:

On 11/18/2013 02:12 PM, Michael Matz wrote:
> 
> From df54486da31d6329696effa61096eda5ab85395a Mon Sep 17 00:00:00 2001
> From: Michael Matz <matz@suse.de>
> Date: Sun, 24 Mar 2013 02:52:42 +0100
> Subject: [PATCH] Fix 32bit rotates.
> 
> The 32bit shifts generally weren't careful with the upper parts,
> either bits could leak in (for right shift) or leak or (for left shift).
> And rotate was completely off, rotating around bit 63, not 31.
> This fixes the CAST5 hash algorithm.
> ---
>  target-arm/translate-a64.c | 30 +++++++++++++++++++++++++++---
>  1 file changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
> index 96dc281..e3941a1 100644
> --- a/target-arm/translate-a64.c
> +++ b/target-arm/translate-a64.c
> @@ -596,25 +596,49 @@ static TCGv_i64 get_shift(int reg, int shift_type, TCGv_i64 tcg_shift,
>      r = tcg_temp_new_i64();
>  
>      /* XXX carry_out */
> +    /* Careful with the width.  We work on 64bit, but must make sure
> +       that we zero-extend the result on out, and ignore any upper bits,
> +       that might still be set in that register.  */
>      switch (shift_type) {
>      case 0: /* LSL */
> +	/* left shift is easy, simply zero-extend on out */
>          tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift);
> +	if (is_32bit)
> +	  tcg_gen_ext32u_i64 (r, r);
>          break;
>      case 1: /* LSR */
> -        tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
> +	/* For logical right shift we zero extend first, to zero
> +	   the upper bits.  We don't need to extend on out.  */
> +	if (is_32bit) {
> +	    tcg_gen_ext32u_i64 (r, cpu_reg(reg));
> +	    tcg_gen_shr_i64 (r, r, tcg_shift);
> +	} else
> +	  tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
>          break;
>      case 2: /* ASR */
> +	/* For arithmetic right shift we sign extend first, then shift,
> +	   and then need to clear the upper bits again.  */
>          if (is_32bit) {
>              TCGv_i64 tcg_tmp = tcg_temp_new_i64();
>              tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg));
>              tcg_gen_sar_i64(r, tcg_tmp, tcg_shift);
> +	    tcg_gen_ext32u_i64 (r, r);
>              tcg_temp_free_i64(tcg_tmp);
>          } else {
>              tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift);
>          }
>          break;
> -    case 3:
> -        tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
> +    case 3: /* ROR */
> +	/* For rotation extending doesn't help, we really have to use
> +	   a 32bit rotate.  */
> +	if (is_32bit) {
> +	    TCGv_i32 tmp = tcg_temp_new_i32();
> +            tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg));
> +	    tcg_gen_rotr_i32(tmp, tmp, tcg_shift);

Isn't this problematic?
We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64.
I remember I had compilation failures in the past when I tried something similar,
so my understanding is that this can work with a certain compiler under certain compiler options,
but is not guaranteed to work in all cases.

I think we need to either explicitly convert the tcg_shift to a TCGv_i32, or we need to use an open coded version of the rotr_i64 that inserts at (32 - n) instead of (64 - n)

What do you think?

C.

> +            tcg_gen_extu_i32_i64(r, tmp);
> +            tcg_temp_free_i32(tmp);
> +	} else
> +	  tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
>          break;
>      }
>  
> -- 1.8.1.4
>
Peter Maydell Nov. 18, 2013, 1:44 p.m. UTC | #4
On 18 November 2013 13:43, Claudio Fontana <claudio.fontana@linaro.org> wrote:
> We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64.
> I remember I had compilation failures in the past when I tried something similar,
> so my understanding is that this can work with a certain compiler under certain compiler options,
> but is not guaranteed to work in all cases.

It's a debug option -- if you build with --enable-debug
then TCGv_i32/TCGv_i64 mismatches should always cause
compile failures.

-- PMM
Michael Matz Nov. 18, 2013, 1:46 p.m. UTC | #5
Hi,

On Mon, 18 Nov 2013, Peter Maydell wrote:

> >> >> +    case 3:
> >> >> +        tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
> >> >> +        break;
> >> >
> >> > Incorrect rotate for 32bit?
> >
> > 32bit rotates and shifts were fixed in a patch later than the 60er series
> > Alex posted.  See attached.  (Generally there are many fixes to emulated
> > instructions in that branch)
> 
> I think we're going to need to look through and fold in those fixes, 
> otherwise we'll end up reduplicating that work in the course of code 
> review :-(

Most probably.  Authorship will be lost :-/ I was planning to submit all 
of these once the 60er set of Alex would be applied.  If you're reworking 
that set more of less completely then it indeed makes more sense to fold 
in those things and so it'd probably be better to have them applied (i.e. 
basically have our full branch applied when dissecting things).

The commits that fix things in the a64 decoder proper (not the linux-user 
implementation) would be:

e14c1a5 softfloat: correctly handle overflow in float[32|64] to uint64 conversion
cbc98b1 aarch64: Fix FCVTZU for single float
a91f762 aarch64: Fix UZP/ZIP/TRN
644c748 aarch64: Fix 32bit TST
2a717e8 Fix FCVTAS and FCVTAU
0dd22d0 Fix decoding of floating<->fixed conversions
d52c999 Fix implementation of USHLL/SSHLL
cfbb9e1 Fix using uninitialized value
ecfdfcd Fix typo in FSUB detection
87fd8ca Increase MAX_OP_PER_INSTR
38452d8 Fix USHLL, and implement other SIMD shifts
4146d40 Fix INS element
a62437c Fix fcmp(e) with NaNs
ec2b8f3 softfloat: Fix float64_to_uint64
b003867 Fix EXTR for 32bit
df54486 Fix 32bit rotates.
33137f8 Fix the pstate flags helpers
75cb838 Don't set flush to zero by default
564e811 Fix 128bit ldr (literal)
0ff91a0 Fix long immediate constants

(That's all on top Alex' posted patchset but not git rebased onto it, top 
of Alex roughly corresponds to commit 40d66b61)


Ciao,
Michael.
Peter Maydell Nov. 18, 2013, 1:49 p.m. UTC | #6
On 18 November 2013 13:46, Michael Matz <matz@suse.de> wrote:
> On Mon, 18 Nov 2013, Peter Maydell wrote:
>> I think we're going to need to look through and fold in those fixes,
>> otherwise we'll end up reduplicating that work in the course of code
>> review :-(
>
> Most probably.  Authorship will be lost :-/ I was planning to submit all
> of these once the 60er set of Alex would be applied.  If you're reworking
> that set more of less completely then it indeed makes more sense to fold
> in those things and so it'd probably be better to have them applied (i.e.
> basically have our full branch applied when dissecting things).
>
> The commits that fix things in the a64 decoder proper (not the linux-user
> implementation) would be:
>
> e14c1a5 softfloat: correctly handle overflow in float[32|64] to uint64 conversion
> cbc98b1 aarch64: Fix FCVTZU for single float
> a91f762 aarch64: Fix UZP/ZIP/TRN
> 644c748 aarch64: Fix 32bit TST
> 2a717e8 Fix FCVTAS and FCVTAU
> 0dd22d0 Fix decoding of floating<->fixed conversions
> d52c999 Fix implementation of USHLL/SSHLL
> cfbb9e1 Fix using uninitialized value
> ecfdfcd Fix typo in FSUB detection
> 87fd8ca Increase MAX_OP_PER_INSTR
> 38452d8 Fix USHLL, and implement other SIMD shifts
> 4146d40 Fix INS element
> a62437c Fix fcmp(e) with NaNs
> ec2b8f3 softfloat: Fix float64_to_uint64
> b003867 Fix EXTR for 32bit
> df54486 Fix 32bit rotates.
> 33137f8 Fix the pstate flags helpers
> 75cb838 Don't set flush to zero by default
> 564e811 Fix 128bit ldr (literal)
> 0ff91a0 Fix long immediate constants

This looks like a small enough list to be tractable to fold
in. My suggestion for authorship would be that we have the
'From' line indicate whoever wrote the bulk of the code and
add sign-off lines from both of you.

(Some of those, like the softfloat fixes, are probably
standalone patches anyway.)

thanks
-- PMM
Michael Matz Nov. 18, 2013, 1:55 p.m. UTC | #7
Hi,

On Mon, 18 Nov 2013, Claudio Fontana wrote:

> > +            tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg));
> > +	    tcg_gen_rotr_i32(tmp, tmp, tcg_shift);
> 
> Isn't this problematic?
> We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64.

With CONFIG_DEBUG_TCG it'll break, yes.  Though in principle there's no 
canonical relation between the two argument types for shifts and rotates 
(unlike addition for example) TCG indeed wants to ensure that 
typeof(arg2)==typeof(arg1).

> I remember I had compilation failures in the past when I tried something 
> similar, so my understanding is that this can work with a certain 
> compiler under certain compiler options, but is not guaranteed to work 
> in all cases.
> 
> I think we need to either explicitly convert the tcg_shift to a 
> TCGv_i32, or we need to use an open coded version of the rotr_i64 that 
> inserts at (32 - n) instead of (64 - n)
> 
> What do you think?

I think converting tcg_shift might eventually lead to better generated 
code (if tcg is optmizing enough, now or in the future, haven't checked).


Ciao,
Michael.
Richard Henderson Nov. 18, 2013, 7:51 p.m. UTC | #8
On 11/18/2013 11:55 PM, Michael Matz wrote:
>> > I think we need to either explicitly convert the tcg_shift to a 
>> > TCGv_i32, or we need to use an open coded version of the rotr_i64 that 
>> > inserts at (32 - n) instead of (64 - n)
>> > 
>> > What do you think?
> I think converting tcg_shift might eventually lead to better generated 
> code (if tcg is optmizing enough, now or in the future, haven't checked).

Agreed.


r~
diff mbox

Patch

From 33137f8a660750d7d8598c7e467f4ccc8dc5ef85 Mon Sep 17 00:00:00 2001
From: Michael Matz <matz@suse.de>
Date: Sat, 23 Mar 2013 04:53:44 +0100
Subject: [PATCH] Fix the pstate flags helpers

ADCS and SBCS/SUBS sometimes gave the wrong results
for the C and V flags.  This fixes it.
---
 target-arm/helper-a64.c | 52 ++++++++++++-------------------------------------
 1 file changed, 12 insertions(+), 40 deletions(-)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 4375bf0..4fcb09b 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -7,8 +7,6 @@ 
 
 uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, uint64_t ar)
 {
-    int64_t s1 = a1;
-    int64_t s2 = a2;
     int64_t sr = ar;
 
     pstate &= ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
@@ -21,11 +19,15 @@  uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, uint64_t
         pstate |= PSTATE_Z;
     }
 
-    if (ar && (ar < a1)) {
+    if (ar < a1) {
         pstate |= PSTATE_C;
+    } else if (ar != (a1 + a2) && ar == a1) {
+	/* If result isn't what we expect it must be because we added
+	   in some carry.  If so we also produce a carry when ar == a1. */
+	pstate |= PSTATE_C;
     }
 
-    if ((s1 > 0 && s2 > 0 && sr < 0) || (s1 < 0 && s2 < 0 && sr > 0)) {
+    if ((int64_t)((a1 ^ a2 ^ -1) & (a1 ^ ar)) < 0) {
         pstate |= PSTATE_V;
     }
 
@@ -38,8 +40,6 @@  uint32_t HELPER(pstate_add32)(uint32_t pstate, uint64_t x1, uint64_t x2, uint64_
     uint32_t a2 = x2;
     uint32_t ar = xr;
 
-    int32_t s1 = a1;
-    int32_t s2 = a2;
     int32_t sr = ar;
 
     pstate &= ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
@@ -52,11 +52,13 @@  uint32_t HELPER(pstate_add32)(uint32_t pstate, uint64_t x1, uint64_t x2, uint64_
         pstate |= PSTATE_Z;
     }
 
-    if (ar && (ar < a1)) {
+    if (ar < a1) {
         pstate |= PSTATE_C;
+    } else if (ar != (a1 + a2) && ar == a1) {
+	pstate |= PSTATE_C;
     }
 
-    if ((s1 > 0 && s2 > 0 && sr < 0) || (s1 < 0 && s2 < 0 && sr > 0)) {
+    if ((int32_t)((a1 ^ a2 ^ -1) & (a1 ^ ar)) < 0) {
         pstate |= PSTATE_V;
     }
 
@@ -65,23 +67,7 @@  uint32_t HELPER(pstate_add32)(uint32_t pstate, uint64_t x1, uint64_t x2, uint64_
 
 uint32_t HELPER(pstate_sub)(uint32_t pstate, uint64_t a1, uint64_t a2, uint64_t ar)
 {
-    int64_t sr = ar;
-    int64_t s1 = a1;
-    int64_t s2 = a2;
-
-    pstate = helper_pstate_add(pstate, a1, a2, ar);
-
-    pstate &= ~(PSTATE_C | PSTATE_V);
-
-    if (a2 <= a1) {
-        pstate |= PSTATE_C;
-    }
-
-    /* XXX check if this is the only special case */
-    if ((!a1 && a2 == 0x8000000000000000ULL) || (s1 > 0 && s2 < 0 && sr < 0) || (s1 < 0 && s2 > 0 && sr > 0)) {
-        pstate |= PSTATE_V;
-    }
-
+    pstate = helper_pstate_add(pstate, a1, ~a2, ar);
     return pstate;
 }
 
@@ -90,22 +76,8 @@  uint32_t HELPER(pstate_sub32)(uint32_t pstate, uint64_t x1, uint64_t x2, uint64_
     uint32_t a1 = x1;
     uint32_t a2 = x2;
     uint32_t ar = xr;
-    int32_t sr = ar;
-    int32_t s1 = a1;
-    int32_t s2 = a2;
-
-    pstate = helper_pstate_add32(pstate, a1, a2, ar);
-
-    pstate &= ~(PSTATE_C | PSTATE_V);
-
-    if (a2 <= a1) {
-        pstate |= PSTATE_C;
-    }
-
-    if ((!a1 && a2 == 0x80000000ULL) || (s1 > 0 && s2 < 0 && sr < 0) || (s1 < 0 && s2 > 0 && sr > 0)) {
-        pstate |= PSTATE_V;
-    }
 
+    pstate = helper_pstate_add32(pstate, a1, ~a2, ar);
     return pstate;
 }
 
-- 
1.8.1.4