Patchwork [5/7] tcg-sparc: Implement setcond, movcond, setcond2, brcond2.

login
register
mail settings
Submitter Richard Henderson
Date Dec. 16, 2009, 11:26 p.m.
Message ID <4259c837ce1a62fcb495e57f18b588eb7365d286.1261012798.git.rth@twiddle.net>
Download mbox | patch
Permalink /patch/41298/
State New
Headers show

Comments

Richard Henderson - Dec. 16, 2009, 11:26 p.m.
An initial cut at conditional moves for the sparc backend.

Untested, as I don't have sparc hardware and the build system
resists attempts at cross-compilation.

Note fixes to tcg_out_movi_imm32 (wrong check_fit_tl width),
use of TCG_TARGET_REG_BITS == 64 tests instead of explicitly
checking for __sparc_v9__ everywhere.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/sparc/tcg-target.c |  415 +++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 360 insertions(+), 55 deletions(-)
Blue Swirl - Dec. 19, 2009, 10:31 a.m.
On Wed, Dec 16, 2009 at 11:26 PM, Richard Henderson <rth@twiddle.net> wrote:
> An initial cut at conditional moves for the sparc backend.
>
> Untested, as I don't have sparc hardware and the build system
> resists attempts at cross-compilation.

I can try if you have a test case.

> Note fixes to tcg_out_movi_imm32 (wrong check_fit_tl width),
> use of TCG_TARGET_REG_BITS == 64 tests instead of explicitly
> checking for __sparc_v9__ everywhere.

Good fixes. I think these should be in a different patch which could be applied.

> -    tcg_out_arith(s, ret, arg, TCG_REG_G0, ARITH_OR);
> +    if (ret != arg)
> +        tcg_out_arith(s, ret, arg, TCG_REG_G0, ARITH_OR);
>  }

This optimization is already handled at tcg-op.h:tcg_gen_mov_i32().

>  static inline void tcg_out_movi_imm32(TCGContext *s, int ret, uint32_t arg)
>  {
> -    if (check_fit_tl(arg, 12))
> +    if (check_fit_tl(arg, 13))
>         tcg_out_movi_imm13(s, ret, arg);

IIRC sign extension prevents this.

>  static inline void tcg_out_movi(TCGContext *s, TCGType type,
>                                 int ret, tcg_target_long arg)
>  {
> -#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
> -    if (!check_fit_tl(arg, 32) && (arg & ~0xffffffffULL) != 0) {
> -        tcg_out_movi_imm32(s, TCG_REG_I4, arg >> 32);
> -        tcg_out_arithi(s, TCG_REG_I4, TCG_REG_I4, 32, SHIFT_SLLX);
> +    if (type == TCG_TYPE_I32 || (arg & ~(tcg_target_long)0xffffffff))
>         tcg_out_movi_imm32(s, ret, arg);
> -        tcg_out_arith(s, ret, ret, TCG_REG_I4, ARITH_OR);
> -    } else if (check_fit_tl(arg, 12))
> -        tcg_out_movi_imm13(s, ret, arg);
> -    else {
> -        tcg_out_sethi(s, ret, arg);
> -        if (arg & 0x3ff)
> -            tcg_out_arithi(s, ret, ret, arg & 0x3ff, ARITH_OR);
> +    else if (TCG_TARGET_REG_BITS == 64) {
> +        if (check_fit_tl(arg, 32)) {
> +            /* Sign extended 32-bit constants are formed with SETHI+XOR.  */
> +            tcg_out_sethi(s, ret, ~arg);
> +            tcg_out_arithi(s, ret, ret, (arg & 0x3ff) | -0x400, ARITH_XOR);
> +        } else {
> +            tcg_out_movi_imm32(s, TCG_REG_I4, arg >> 32);
> +            tcg_out_arithi(s, TCG_REG_I4, TCG_REG_I4, 32, SHIFT_SLLX);
> +            tcg_out_movi_imm32(s, ret, arg);
> +            tcg_out_arith(s, ret, ret, TCG_REG_I4, ARITH_OR);
> +        }
>     }
> -#else
> -    tcg_out_movi_imm32(s, ret, arg);
> -#endif
>  }

Please split this also to another patch, it looks good.


> +        int32_t val = l->u.value - (tcg_target_long)s->code_ptr;
> +        tcg_out32(s, (INSN_OP(0) | opc | INSN_OP2(0x2)
>                       | INSN_OFF22(l->u.value - (unsigned long)s->code_ptr)));
>     } else {
>         tcg_out_reloc(s, s->code_ptr, R_SPARC_WDISP22, label_index, 0);
> -        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x2) | 0));
> +        tcg_out32(s, (INSN_OP(0) | opc | INSN_OP2(0x2) | 0));

What instruction is this? A define would be in order.

> -        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x1) |
> -                      (0x5 << 19) |
> +        tcg_out32(s, (INSN_OP(0) | opc | INSN_OP2(0x1) | (0x5 << 19) |
>                       INSN_OFF19(l->u.value - (unsigned long)s->code_ptr)));
>     } else {
>         tcg_out_reloc(s, s->code_ptr, R_SPARC_WDISP19, label_index, 0);
> -        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x1) |
> -                      (0x5 << 19) | 0));
> +        tcg_out32(s, (INSN_OP(0) | opc | INSN_OP2(0x1) | (0x5 << 19) | 0));

Same here.

>  static void tcg_out_brcond_i32(TCGContext *s, int cond,
>                                TCGArg arg1, TCGArg arg2, int const_arg2,
>                                int label_index)
>  {
> -    if (const_arg2 && arg2 == 0)
> -        /* orcc %g0, r, %g0 */
> -        tcg_out_arith(s, TCG_REG_G0, TCG_REG_G0, arg1, ARITH_ORCC);
> -    else
> -        /* subcc r1, r2, %g0 */
> -        tcg_out_arith(s, TCG_REG_G0, arg1, arg2, ARITH_SUBCC);
> -    tcg_out_branch_i32(s, tcg_cond_to_bcond[cond], label_index);
> +    tcg_out_cmp(s, arg1, arg2, const_arg2);

What's wrong with 'orcc' (produces the synthetic instruction 'tst')?
Richard Henderson - Dec. 19, 2009, 5:47 p.m.
On 12/19/2009 02:31 AM, Blue Swirl wrote:
>>   static inline void tcg_out_movi_imm32(TCGContext *s, int ret, uint32_t arg)
>>   {
>> -    if (check_fit_tl(arg, 12))
>> +    if (check_fit_tl(arg, 13))
>>          tcg_out_movi_imm13(s, ret, arg);
>
> IIRC sign extension prevents this.

Pardon?  check_fit_tl checks a signed value, the OR opcode provides one. 
  Where's the conflict?

>> -    if (const_arg2&&  arg2 == 0)
>> -        /* orcc %g0, r, %g0 */
>> -        tcg_out_arith(s, TCG_REG_G0, TCG_REG_G0, arg1, ARITH_ORCC);
>> -    else
>> -        /* subcc r1, r2, %g0 */
>> -        tcg_out_arith(s, TCG_REG_G0, arg1, arg2, ARITH_SUBCC);
>> -    tcg_out_branch_i32(s, tcg_cond_to_bcond[cond], label_index);
>> +    tcg_out_cmp(s, arg1, arg2, const_arg2);
>
> What's wrong with 'orcc' (produces the synthetic instruction 'tst')?

What result does "orcc" give that isn't produced by "subcc"?  Unlike 
i386 where "test x,x" is one byte smaller than "cmp $0,x", it seems to 
me there's no reason to distinguish the arg2 == constant zero case on sparc.


r~
Blue Swirl - Dec. 19, 2009, 9:25 p.m.
On Sat, Dec 19, 2009 at 5:47 PM, Richard Henderson <rth@twiddle.net> wrote:
> On 12/19/2009 02:31 AM, Blue Swirl wrote:
>>>
>>>  static inline void tcg_out_movi_imm32(TCGContext *s, int ret, uint32_t
>>> arg)
>>>  {
>>> -    if (check_fit_tl(arg, 12))
>>> +    if (check_fit_tl(arg, 13))
>>>         tcg_out_movi_imm13(s, ret, arg);
>>
>> IIRC sign extension prevents this.
>
> Pardon?  check_fit_tl checks a signed value, the OR opcode provides one.
>  Where's the conflict?

Long time ago I tried the same change, but the generated code was not
correct. But now it seems to work.

>>> -    if (const_arg2&&  arg2 == 0)
>>> -        /* orcc %g0, r, %g0 */
>>> -        tcg_out_arith(s, TCG_REG_G0, TCG_REG_G0, arg1, ARITH_ORCC);
>>> -    else
>>> -        /* subcc r1, r2, %g0 */
>>> -        tcg_out_arith(s, TCG_REG_G0, arg1, arg2, ARITH_SUBCC);
>>> -    tcg_out_branch_i32(s, tcg_cond_to_bcond[cond], label_index);
>>> +    tcg_out_cmp(s, arg1, arg2, const_arg2);
>>
>> What's wrong with 'orcc' (produces the synthetic instruction 'tst')?
>
> What result does "orcc" give that isn't produced by "subcc"?  Unlike i386
> where "test x,x" is one byte smaller than "cmp $0,x", it seems to me there's
> no reason to distinguish the arg2 == constant zero case on sparc.

Maybe it's faster on real CPUs. On my machine I don't see any
difference. I timed the following program:

#include <stdio.h>
#include <stdlib.h>

#define N 100000000

int main(int argc, char **argv)
{
    unsigned int i;

    if (atoi(argv[1])) {
        for (i = 0; i < N; i++) {
            asm volatile ("cmp %g1, 0");
        }
    } else {
        for (i = 0; i < N; i++) {
            asm volatile ("tst %g1");
        }
    }
    return 0;
}
Richard Henderson - Dec. 19, 2009, 10:52 p.m.
On 12/19/2009 01:25 PM, Blue Swirl wrote:
> On Sat, Dec 19, 2009 at 5:47 PM, Richard Henderson<rth@twiddle.net>  wrote:
>> On 12/19/2009 02:31 AM, Blue Swirl wrote:
>>>>
>>>>   static inline void tcg_out_movi_imm32(TCGContext *s, int ret, uint32_t
>>>> arg)
>>>>   {
>>>> -    if (check_fit_tl(arg, 12))
>>>> +    if (check_fit_tl(arg, 13))
>>>>          tcg_out_movi_imm13(s, ret, arg);
>>>
>>> IIRC sign extension prevents this.
>>
>> Pardon?  check_fit_tl checks a signed value, the OR opcode provides one.
>>   Where's the conflict?
>
> Long time ago I tried the same change, but the generated code was not
> correct. But now it seems to work.

I can't imagine why.  I've looked at the instruction encodings very 
carefully and double-checked the arithmetic vs GCC.

I still think the change is correct.  I can probably dig up access to a 
sparc machine and verify, but I don't know that would convince you.


r~
Blue Swirl - Dec. 20, 2009, 11:06 a.m.
On Sat, Dec 19, 2009 at 10:52 PM, Richard Henderson <rth@twiddle.net> wrote:
> On 12/19/2009 01:25 PM, Blue Swirl wrote:
>>
>> On Sat, Dec 19, 2009 at 5:47 PM, Richard Henderson<rth@twiddle.net>
>>  wrote:
>>>
>>> On 12/19/2009 02:31 AM, Blue Swirl wrote:
>>>>>
>>>>>  static inline void tcg_out_movi_imm32(TCGContext *s, int ret, uint32_t
>>>>> arg)
>>>>>  {
>>>>> -    if (check_fit_tl(arg, 12))
>>>>> +    if (check_fit_tl(arg, 13))
>>>>>         tcg_out_movi_imm13(s, ret, arg);
>>>>
>>>> IIRC sign extension prevents this.
>>>
>>> Pardon?  check_fit_tl checks a signed value, the OR opcode provides one.
>>>  Where's the conflict?
>>
>> Long time ago I tried the same change, but the generated code was not
>> correct. But now it seems to work.
>
> I can't imagine why.  I've looked at the instruction encodings very
> carefully and double-checked the arithmetic vs GCC.
>
> I still think the change is correct.  I can probably dig up access to a
> sparc machine and verify, but I don't know that would convince you.

No need for that. I tested it and now it seems to work, thanks.

There's another case a few lines later, I'll change both.

Patch

diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 23cd9cd..351683a 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -191,6 +191,8 @@  static inline int tcg_target_const_match(tcg_target_long val,
 #define INSN_RS2(x) (x)
 #define INSN_ASI(x) ((x) << 5)
 
+#define INSN_IMM10(x) ((1 << 13) | ((x) & 0x3ff))
+#define INSN_IMM11(x) ((1 << 13) | ((x) & 0x7ff))
 #define INSN_IMM13(x) ((1 << 13) | ((x) & 0x1fff))
 #define INSN_OFF19(x) (((x) >> 2) & 0x07ffff)
 #define INSN_OFF22(x) (((x) >> 2) & 0x3fffff)
@@ -214,6 +216,20 @@  static inline int tcg_target_const_match(tcg_target_long val,
 #define COND_VC    0xf
 #define BA         (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x2))
 
+#define BPCC_ICC   (0)
+#define BPCC_XCC   (1 << 21)
+
+#define MOVCC_ICC  (1 << 18)
+#define MOVCC_XCC  (1 << 18 | 1 << 12)
+
+#define MRCOND_E   0x1
+#define MRCOND_LE  0x2
+#define MRCOND_L   0x3
+#define MRCOND_NE  0x5
+#define MRCOND_G   0x6
+#define MRCOND_GE  0x7
+#define INSN_MRCOND(c) ((c) << 10)
+
 #define ARITH_ADD  (INSN_OP(2) | INSN_OP3(0x00))
 #define ARITH_AND  (INSN_OP(2) | INSN_OP3(0x01))
 #define ARITH_OR   (INSN_OP(2) | INSN_OP3(0x02))
@@ -228,7 +244,9 @@  static inline int tcg_target_const_match(tcg_target_long val,
 #define ARITH_SDIV (INSN_OP(2) | INSN_OP3(0x0f))
 #define ARITH_MULX (INSN_OP(2) | INSN_OP3(0x09))
 #define ARITH_UDIVX (INSN_OP(2) | INSN_OP3(0x0d))
+#define ARITH_MOVCC (INSN_OP(2) | INSN_OP3(0x2c))
 #define ARITH_SDIVX (INSN_OP(2) | INSN_OP3(0x2d))
+#define ARITH_MOVR (INSN_OP(2) | INSN_OP3(0x2f))
 
 #define SHIFT_SLL  (INSN_OP(2) | INSN_OP3(0x25))
 #define SHIFT_SRL  (INSN_OP(2) | INSN_OP3(0x26))
@@ -287,7 +305,8 @@  static inline void tcg_out_arithi(TCGContext *s, int rd, int rs1,
 
 static inline void tcg_out_mov(TCGContext *s, int ret, int arg)
 {
-    tcg_out_arith(s, ret, arg, TCG_REG_G0, ARITH_OR);
+    if (ret != arg)
+        tcg_out_arith(s, ret, arg, TCG_REG_G0, ARITH_OR);
 }
 
 static inline void tcg_out_sethi(TCGContext *s, int ret, uint32_t arg)
@@ -302,7 +321,7 @@  static inline void tcg_out_movi_imm13(TCGContext *s, int ret, uint32_t arg)
 
 static inline void tcg_out_movi_imm32(TCGContext *s, int ret, uint32_t arg)
 {
-    if (check_fit_tl(arg, 12))
+    if (check_fit_tl(arg, 13))
         tcg_out_movi_imm13(s, ret, arg);
     else {
         tcg_out_sethi(s, ret, arg);
@@ -314,22 +333,20 @@  static inline void tcg_out_movi_imm32(TCGContext *s, int ret, uint32_t arg)
 static inline void tcg_out_movi(TCGContext *s, TCGType type,
                                 int ret, tcg_target_long arg)
 {
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
-    if (!check_fit_tl(arg, 32) && (arg & ~0xffffffffULL) != 0) {
-        tcg_out_movi_imm32(s, TCG_REG_I4, arg >> 32);
-        tcg_out_arithi(s, TCG_REG_I4, TCG_REG_I4, 32, SHIFT_SLLX);
+    if (type == TCG_TYPE_I32 || (arg & ~(tcg_target_long)0xffffffff))
         tcg_out_movi_imm32(s, ret, arg);
-        tcg_out_arith(s, ret, ret, TCG_REG_I4, ARITH_OR);
-    } else if (check_fit_tl(arg, 12))
-        tcg_out_movi_imm13(s, ret, arg);
-    else {
-        tcg_out_sethi(s, ret, arg);
-        if (arg & 0x3ff)
-            tcg_out_arithi(s, ret, ret, arg & 0x3ff, ARITH_OR);
+    else if (TCG_TARGET_REG_BITS == 64) {
+        if (check_fit_tl(arg, 32)) {
+            /* Sign extended 32-bit constants are formed with SETHI+XOR.  */
+            tcg_out_sethi(s, ret, ~arg);
+            tcg_out_arithi(s, ret, ret, (arg & 0x3ff) | -0x400, ARITH_XOR);
+        } else {
+            tcg_out_movi_imm32(s, TCG_REG_I4, arg >> 32);
+            tcg_out_arithi(s, TCG_REG_I4, TCG_REG_I4, 32, SHIFT_SLLX);
+            tcg_out_movi_imm32(s, ret, arg);
+            tcg_out_arith(s, ret, ret, TCG_REG_I4, ARITH_OR);
+        }
     }
-#else
-    tcg_out_movi_imm32(s, ret, arg);
-#endif
 }
 
 static inline void tcg_out_ld_raw(TCGContext *s, int ret,
@@ -345,16 +362,18 @@  static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
 {
     if (!check_fit_tl(arg, 10))
         tcg_out_movi(s, TCG_TYPE_PTR, ret, arg & ~0x3ffULL);
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
-    tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(ret) |
-              INSN_IMM13(arg & 0x3ff));
-#else
-    tcg_out32(s, LDUW | INSN_RD(ret) | INSN_RS1(ret) |
-              INSN_IMM13(arg & 0x3ff));
-#endif
+
+    if (TCG_TARGET_REG_BITS == 64) {
+        tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(ret) |
+                  INSN_IMM13(arg & 0x3ff));
+    } else { 
+        tcg_out32(s, LDUW | INSN_RD(ret) | INSN_RS1(ret) |
+                  INSN_IMM13(arg & 0x3ff));
+    }
 }
 
-static inline void tcg_out_ldst(TCGContext *s, int ret, int addr, int offset, int op)
+static inline void tcg_out_ldst(TCGContext *s, int ret, int addr,
+                                int offset, int op)
 {
     if (check_fit_tl(offset, 13))
         tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(addr) |
@@ -435,16 +454,16 @@  static void tcg_out_branch_i32(TCGContext *s, int opc, int label_index)
     TCGLabel *l = &s->labels[label_index];
 
     if (l->has_value) {
-        val = l->u.value - (tcg_target_long)s->code_ptr;
-        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x2)
+        int32_t val = l->u.value - (tcg_target_long)s->code_ptr;
+        tcg_out32(s, (INSN_OP(0) | opc | INSN_OP2(0x2)
                       | INSN_OFF22(l->u.value - (unsigned long)s->code_ptr)));
     } else {
         tcg_out_reloc(s, s->code_ptr, R_SPARC_WDISP22, label_index, 0);
-        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x2) | 0));
+        tcg_out32(s, (INSN_OP(0) | opc | INSN_OP2(0x2) | 0));
     }
 }
 
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
+#if TCG_TARGET_REG_BITS == 64
 static void tcg_out_branch_i64(TCGContext *s, int opc, int label_index)
 {
     int32_t val;
@@ -452,13 +471,11 @@  static void tcg_out_branch_i64(TCGContext *s, int opc, int label_index)
 
     if (l->has_value) {
         val = l->u.value - (tcg_target_long)s->code_ptr;
-        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x1) |
-                      (0x5 << 19) |
+        tcg_out32(s, (INSN_OP(0) | opc | INSN_OP2(0x1) | (0x5 << 19) |
                       INSN_OFF19(l->u.value - (unsigned long)s->code_ptr)));
     } else {
         tcg_out_reloc(s, s->code_ptr, R_SPARC_WDISP19, label_index, 0);
-        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x1) |
-                      (0x5 << 19) | 0));
+        tcg_out32(s, (INSN_OP(0) | opc | INSN_OP2(0x1) | (0x5 << 19) | 0));
     }
 }
 #endif
@@ -476,34 +493,280 @@  static const uint8_t tcg_cond_to_bcond[10] = {
     [TCG_COND_GTU] = COND_GU,
 };
 
+static void tcg_out_cmp(TCGContext *s, TCGArg c1, TCGArg c2, int c2const)
+{
+    if (c2const)
+        tcg_out_arithi(s, TCG_REG_G0, c1, c2, ARITH_SUBCC);
+    else
+        tcg_out_arith(s, TCG_REG_G0, c1, c2, ARITH_SUBCC);
+}
+
 static void tcg_out_brcond_i32(TCGContext *s, int cond,
                                TCGArg arg1, TCGArg arg2, int const_arg2,
                                int label_index)
 {
-    if (const_arg2 && arg2 == 0)
-        /* orcc %g0, r, %g0 */
-        tcg_out_arith(s, TCG_REG_G0, TCG_REG_G0, arg1, ARITH_ORCC);
-    else
-        /* subcc r1, r2, %g0 */
-        tcg_out_arith(s, TCG_REG_G0, arg1, arg2, ARITH_SUBCC);
-    tcg_out_branch_i32(s, tcg_cond_to_bcond[cond], label_index);
+    tcg_out_cmp(s, arg1, arg2, const_arg2);
+    tcg_out_branch_i32(s, INSN_COND(tcg_cond_to_bcond[cond], 0), label_index);
     tcg_out_nop(s);
 }
 
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
+#if TCG_TARGET_REG_BITS == 64
 static void tcg_out_brcond_i64(TCGContext *s, int cond,
                                TCGArg arg1, TCGArg arg2, int const_arg2,
                                int label_index)
 {
-    if (const_arg2 && arg2 == 0)
-        /* orcc %g0, r, %g0 */
-        tcg_out_arith(s, TCG_REG_G0, TCG_REG_G0, arg1, ARITH_ORCC);
-    else
-        /* subcc r1, r2, %g0 */
-        tcg_out_arith(s, TCG_REG_G0, arg1, arg2, ARITH_SUBCC);
-    tcg_out_branch_i64(s, tcg_cond_to_bcond[cond], label_index);
+    tcg_out_cmp(s, arg1, arg2, const_arg2);
+    tcg_out_branch_i64(s, INSN_COND(tcg_cond_to_bcond[cond], 0) | BPCC_XCC,
+                       label_index);
     tcg_out_nop(s);
 }
+#else
+static void tcg_out_brcond2_i32(TCGContext *s, int cond,
+                                TCGArg al, TCGArg ah,
+                                TCGArg bl, int blconst,
+                                TCGArg bh, int bhconst, int label_dest)
+{
+    int label_next = gen_new_label();
+    int c1, c2, c3;
+
+    /* ??? For v8plus, consider reserving two global registers so that we
+       can reconstruct the 64-bit values there and compare them directly.  */
+
+    switch (cond) {
+    case TCG_COND_EQ:
+        c1 = -1, c2 = TCG_COND_NE, c3 = TCG_COND_EQ;
+        break;
+    case TCG_COND_NE:
+        c1 = TCG_COND_NE, c2 = -1, c3 = TCG_COND_NE;
+        break;
+    case TCG_COND_LT:
+    case TCG_COND_LTU:
+        c1 = cond, c2 = TCG_COND_NE, c3 = TCG_COND_LTU;
+        break;
+    case TCG_COND_LE:
+    case TCG_COND_LEU:
+        c1 = cond, c2 = TCG_COND_NE, c3 = TCG_COND_LEU;
+        break;
+    case TCG_COND_GT:
+    case TCG_COND_GTU:
+        c1 = cond, c2 = TCG_COND_NE, c3 = TCG_COND_GTU;
+        break;
+    case TCG_COND_GE:
+    case TCG_COND_GEU:
+        c1 = cond, c2 = TCG_COND_NE, c3 = TCG_COND_GEU;
+        break;
+    default:
+        tcg_abort ();
+    }
+
+    tcg_out_cmp(s, ah, bh, bhconst);
+    if (c1 != -1) {
+        tcg_out_branch_i32(s, INSN_COND(tcg_cond_to_bcond[c1], 0), label_dest);
+        if (c2 != -1)
+            tcg_out_nop(s);
+    }
+    if (c2 != -1) {
+        tcg_out_branch_i32(s, INSN_COND(tcg_cond_to_bcond[c2], 0), label_next);
+    }
+    tcg_out_cmp(s, al, bl, blconst);
+    tcg_out_branch_i32(s, INSN_COND(tcg_cond_to_bcond[c3], 0), label_dest);
+    tcg_out_nop(s);
+
+    tcg_out_label(s, label_next, (tcg_target_long)s->code_ptr);
+}
+#endif
+
+#if defined(__sparc_v9__) || defined(__sparc_v8plus__)
+static const uint8_t tcg_cond_to_mrcond[10] = {
+    [TCG_COND_EQ] = MRCOND_E,
+    [TCG_COND_NE] = MRCOND_NE,
+    [TCG_COND_LT] = MRCOND_L,
+    [TCG_COND_GE] = MRCOND_GE,
+    [TCG_COND_LE] = MRCOND_LE,
+    [TCG_COND_GT] = MRCOND_G
+};
+#endif
+
+static void tcg_out_movcond(TCGContext *s, int cond, TCGArg d,
+                            TCGArg c1, TCGArg c2, int c2const,
+                            TCGArg vt, int vtconst, TCGArg vf, int vfconst,
+                            int i64 __attribute__((unused)))
+{
+    TCGArg t;
+
+    if (vtconst == vfconst && vt == vf) {
+        if (vtconst)
+            tcg_out_movi_imm13(s, d, vt);
+        else
+            tcg_out_mov(s, d, vt);
+        return;
+    }
+
+#if defined(__sparc_v9__) || defined(__sparc_v8plus__)
+    /* Use MOVR when possible.  We have constrained the constants to IMM11
+       for the sake of MOVCC, but MOVR only takes IMM10.  Also, we cannot
+       overwrite the condition with our initial store to D.  Any attempt to
+       work around these extra conditions is just as much work as falling
+       back to MOVCC.  */
+    if (c2 == 0 && tcg_cond_to_mrcond[cond] && d != c1) {
+        int use_movr = 1, do_swap = 0;
+
+        if (vtconst) {
+            if (!check_fit_tl(vt, 10)) {
+                do_swap = 1;
+                if (vfconst)
+                    use_movr = check_fit_tl(vt, 10);
+                else
+                    use_movr = (d != vf);
+            }
+        } else if (d == vt) {
+            do_swap = 1;
+            if (vfconst)
+                use_movr = check_fit_tl(vt, 10);
+        }
+        if (use_movr) {
+            if (do_swap) {
+                cond = tcg_invert_cond(cond);
+                t = vt, vt = vf, vf = t;
+                t = vtconst, vtconst = vfconst, vfconst = t;
+            }
+            if (vfconst)
+                tcg_out_movi_imm13(s, d, vf);
+            else
+                tcg_out_mov(s, d, vf);
+            tcg_out32(s, ARITH_MOVR | INSN_RD(d) | INSN_RS1(c1)
+                      | INSN_MRCOND(tcg_cond_to_mrcond[cond])
+                      | (vtconst ? INSN_IMM10(vt) : INSN_RS2(vt)));
+            return;
+        }
+    }
+
+    tcg_out_cmp(s, c1, c2, c2const);
+
+    if (!vtconst && vt == d) {
+        cond = tcg_invert_cond(cond);
+        vt = vf, vf = d;
+        vtconst = vfconst, vfconst = 0;
+    }
+    if (vfconst)
+        tcg_out_movi_imm13(s, d, vf);
+    else
+        tcg_out_mov(s, d, vf);
+    tcg_out32(s, ARITH_MOVCC | INSN_RD(d)
+              | INSN_RS1(tcg_cond_to_bcond[cond])
+              | (i64 ? MOVCC_XCC : MOVCC_ICC)
+              | (vtconst ? INSN_IMM11(vt) : INSN_RS2(vt)));
+#else
+    t = gen_new_label ();
+
+    tcg_out_cmp(s, c1, c2, c2const);
+    tcg_out_branch_i32(s, INSN_COND(tcg_cond_to_bcond[cond], 1), t);
+    if (vtconst)
+        tcg_out_movi_imm13(s, d, vt);
+    else
+        tcg_out_mov(s, d, vt);
+    if (vfconst)
+        tcg_out_movi_imm13(s, d, vf);
+    else
+        tcg_out_mov(s, d, vf);
+
+    tcg_out_label(s, t, (tcg_target_long)s->code_ptr);
+#endif
+}
+
+static void tcg_out_setcond_i32(TCGContext *s, int cond, TCGArg d,
+                                TCGArg c1, TCGArg c2, int c2const)
+{
+    TCGArg t;
+
+    /* For 32-bit comparisons, we can play games with ADDX/SUBX in order
+       to get the correct value into the register.  Don't go beyond this
+       because the movcond fallback is only 4 insns.  */
+    switch (cond) {
+    case TCG_COND_EQ:
+        if (c2 != 0) {
+            if (c2const)
+                tcg_out_arithi(s, d, c1, c2, ARITH_XOR);
+            else
+                tcg_out_arith(s, d, c1, c2, ARITH_XOR);
+        }
+        c1 = d, c2 = TCG_REG_G0, c2const = 0;
+        cond = TCG_COND_LEU;
+        break;
+
+    case TCG_COND_NE:
+        if (c2 != 0) {
+            if (c2const)
+                tcg_out_arithi(s, d, c1, c2, ARITH_XOR);
+            else
+                tcg_out_arith(s, d, c1, c2, ARITH_XOR);
+        }
+        c1 = TCG_REG_G0, c2 = d, c2const = 0;
+        cond = TCG_COND_LTU;
+        break;
+
+    case TCG_COND_GTU:
+    case TCG_COND_GEU:
+        if (c2const && c2 != 0) {
+            tcg_out_movi_imm13(s, TCG_REG_I5, c2);
+            c2 = TCG_REG_I5;
+        }
+        t = c1, c1 = c2, c2 = t, c2const = 0;
+        cond = (cond == TCG_COND_GTU ? TCG_COND_LTU : TCG_COND_LEU);
+        break;
+
+    case TCG_COND_LTU:
+    case TCG_COND_LEU:
+        break;
+
+    default:
+        tcg_out_movcond(s, cond, d, c1, c2, c2const, 1, 1, 0, 1, 0);
+        return;
+    }
+
+    tcg_out_arith(s, TCG_REG_G0, c1, c2, ARITH_SUBCC);
+    if (cond == TCG_COND_LTU)
+        tcg_out_arithi(s, d, TCG_REG_G0, 0, ARITH_ADDX);
+    else
+        tcg_out_arithi(s, d, TCG_REG_G0, -1, ARITH_SUBX);
+}
+
+#if TCG_TARGET_REG_BITS == 32
+static void tcg_out_setcond2_i32(TCGContext *s, int cond, TCGArg d,
+                                 TCGArg al, TCGArg ah, TCGArg bl, int blconst,
+                                 TCGArg bh, int bhconst)
+{
+    TCGArg scratch = TCG_REG_I5;
+    int label;
+
+    if (d != al && d != ah && (blconst || d != bl) && (bhconst || d != bh))
+        scratch = d;
+
+    /* ??? For v8plus, consider reserving two global registers so that we
+       can reconstruct the 64-bit values there and compare them directly.  */
+
+    switch (cond) {
+    case TCG_COND_EQ:
+        tcg_out_setcond_i32(s, TCG_COND_EQ, scratch, al, bl, blconst);
+        tcg_out_movcond(s, TCG_COND_EQ, scratch, ah, bh, bhconst,
+                        scratch, 0, 0, 1, 0);
+        break;
+    case TCG_COND_NE:
+        tcg_out_setcond_i32(s, TCG_COND_NE, scratch, al, bl, blconst);
+        tcg_out_movcond(s, TCG_COND_NE, scratch, ah, bh, bhconst,
+                        1, 1, scratch, 0, 0);
+        break;
+
+    default:
+        label = gen_new_label();
+        tcg_out_movi_imm13(s, scratch, 1);
+        tcg_out_brcond2_i32(s, cond, al, ah, bl, blconst, bh, bhconst, label);
+        tcg_out_movi_imm13(s, scratch, 0);
+        tcg_out_label(s, label, (tcg_target_long)s->code_ptr);
+        break;
+    }
+
+    tcg_out_mov(s, d, scratch);}
 #endif
 
 /* Generate global QEMU prologue and epilogue code */
@@ -986,7 +1249,7 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         tcg_out_movi(s, TCG_TYPE_I32, args[0], (uint32_t)args[1]);
         break;
 
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
+#if TCG_TARGET_REG_BITS == 64
 #define OP_32_64(x)                             \
         glue(glue(case INDEX_op_, x), _i32:)    \
         glue(glue(case INDEX_op_, x), _i64:)
@@ -1007,7 +1270,7 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         tcg_out_ldst(s, args[0], args[1], args[2], LDSH);
         break;
     case INDEX_op_ld_i32:
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
+#if TCG_TARGET_REG_BITS == 64
     case INDEX_op_ld32u_i64:
 #endif
         tcg_out_ldst(s, args[0], args[1], args[2], LDUW);
@@ -1019,7 +1282,7 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         tcg_out_ldst(s, args[0], args[1], args[2], STH);
         break;
     case INDEX_op_st_i32:
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
+#if TCG_TARGET_REG_BITS == 64
     case INDEX_op_st32_i64:
 #endif
         tcg_out_ldst(s, args[0], args[1], args[2], STW);
@@ -1074,6 +1337,26 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         tcg_out_brcond_i32(s, args[2], args[0], args[1], const_args[1],
                            args[3]);
         break;
+    case INDEX_op_setcond_i32:
+        tcg_out_setcond_i32(s, args[3], args[0], args[1],
+                            args[2], const_args[2]);
+        break;
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond(s, args[5], args[0], args[1], args[2],
+                        const_args[2], args[3], const_args[3],
+                        args[4], const_args[4], 0);
+        break;
+#if TCG_TARGET_REG_BITS == 32
+    case INDEX_op_brcond2_i32:
+        tcg_out_brcond2_i32(s, args[4], args[0], args[1],
+                            args[2], const_args[2],
+                            args[3], const_args[3], args[5]);
+        break;
+    case INDEX_op_setcond2_i32:
+        tcg_out_setcond2_i32(s, args[5], args[0], args[1], args[2],
+                             args[3], const_args[3], args[4], const_args[4]);
+        break;
+#endif
 
     case INDEX_op_qemu_ld8u:
         tcg_out_qemu_ld(s, args, 0);
@@ -1103,7 +1386,7 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         tcg_out_qemu_st(s, args, 2);
         break;
 
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
+#if TCG_TARGET_REG_BITS == 64
     case INDEX_op_movi_i64:
         tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]);
         break;
@@ -1139,6 +1422,16 @@  static inline void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
         tcg_out_brcond_i64(s, args[2], args[0], args[1], const_args[1],
                            args[3]);
         break;
+    case INDEX_op_setcond_i64:
+        tcg_out_movcond(s, args[3], args[0], args[1], args[2],
+                        const_args[2], 1, 1, 0, 1, 1);
+        break;
+    case INDEX_op_movcond_i64:
+        tcg_out_movcond(s, args[5], args[0], args[1], args[2],
+                        const_args[2], args[3], const_args[3],
+                        args[4], const_args[4], 1);
+        break;
+
     case INDEX_op_qemu_ld64:
         tcg_out_qemu_ld(s, args, 3);
         break;
@@ -1192,7 +1485,17 @@  static const TCGTargetOpDef sparc_op_defs[] = {
     { INDEX_op_shr_i32, { "r", "r", "rJ" } },
     { INDEX_op_sar_i32, { "r", "r", "rJ" } },
 
-    { INDEX_op_brcond_i32, { "r", "ri" } },
+    { INDEX_op_brcond_i32, { "r", "rJ" } },
+    { INDEX_op_setcond_i32, { "r", "r", "rJ" } },
+#if defined(__sparc_v9__) || defined(__sparc_v8plus__)
+    { INDEX_op_movcond_i32, { "r", "r", "rJ", "rI", "rI" } },
+#else
+    { INDEX_op_movcond_i32, { "r", "r", "rJ", "rJ", "rJ" } },
+#endif
+#if TCG_TARGET_REG_BITS == 32
+    { INDEX_op_brcond2_i32, { "r", "r", "rJ", "rJ" } },
+    { INDEX_op_setcond2_i32, { "r", "r", "r", "rJ", "rJ" } },
+#endif
 
     { INDEX_op_qemu_ld8u, { "r", "L" } },
     { INDEX_op_qemu_ld8s, { "r", "L" } },
@@ -1235,7 +1538,9 @@  static const TCGTargetOpDef sparc_op_defs[] = {
     { INDEX_op_shr_i64, { "r", "r", "rJ" } },
     { INDEX_op_sar_i64, { "r", "r", "rJ" } },
 
-    { INDEX_op_brcond_i64, { "r", "ri" } },
+    { INDEX_op_brcond_i64, { "r", "rJ" } },
+    { INDEX_op_setcond_i64, { "r", "r", "rJ" } },
+    { INDEX_op_movcond_i64, { "r", "r", "rJ", "rI", "rI" } },
 #endif
     { -1 },
 };
@@ -1243,7 +1548,7 @@  static const TCGTargetOpDef sparc_op_defs[] = {
 void tcg_target_init(TCGContext *s)
 {
     tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
+#if TCG_TARGET_REG_BITS == 64
     tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffffffff);
 #endif
     tcg_regset_set32(tcg_target_call_clobber_regs, 0,
@@ -1264,7 +1569,7 @@  void tcg_target_init(TCGContext *s)
 
     tcg_regset_clear(s->reserved_regs);
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_G0);
-#if defined(__sparc_v9__) && !defined(__sparc_v8plus__)
+#if TCG_TARGET_REG_BITS == 32
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_I4); // for internal use
 #endif
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_I5); // for internal use