diff mbox

[v2,6/9] target-sh4: split out Q and M from of SR and optimize div1

Message ID 1387713039-9584-7-git-send-email-aurelien@aurel32.net
State New
Headers show

Commit Message

Aurelien Jarno Dec. 22, 2013, 11:50 a.m. UTC
Splitting Q and M out of SR, it's possible to optimize div1 by using
TCG code instead of an helper.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/cpu.h       |   13 ++++--
 target-sh4/helper.h    |    1 -
 target-sh4/op_helper.c |  118 ------------------------------------------------
 target-sh4/translate.c |   67 +++++++++++++++++++++++----
 4 files changed, 68 insertions(+), 131 deletions(-)

Comments

Richard Henderson Dec. 24, 2013, 2:44 p.m. UTC | #1
On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
>  static void gen_read_sr(TCGv dst)
>  {
> -    tcg_gen_andi_i32(dst, cpu_sr, ~(1u << SR_T));
> -    tcg_gen_or_i32(dst, dst, cpu_sr_t);
> +    TCGv t0 = tcg_temp_new();
> +    tcg_gen_andi_i32(dst, cpu_sr,
> +                     ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
> +    tcg_gen_shli_i32(t0, cpu_sr_q, SR_Q);
> +    tcg_gen_or_i32(dst, dst, t0);
> +    tcg_gen_shli_i32(t0, cpu_sr_m, SR_M);
> +    tcg_gen_or_i32(dst, dst, t0);
> +    tcg_gen_shli_i32(t0, cpu_sr_t, SR_T);
> +    tcg_gen_or_i32(dst, dst, t0);
> +    tcg_temp_free_i32(t0);
>  }

Similar comments for SR_[QM] as for SR_T wrt who clears the relevant bits in
env->sr.


>      case 0x2007:		/* div0s Rm,Rn */
>  	{
> -            gen_copy_bit_i32(cpu_sr, SR_Q, REG(B11_8), 31);     /* SR_Q */
> -            gen_copy_bit_i32(cpu_sr, SR_M, REG(B7_4), 31);      /* SR_M */
> +            tcg_gen_shri_i32(cpu_sr_q, REG(B11_8), 31);         /* SR_Q */
> +            tcg_gen_mov_i32(cpu_sr_m, cpu_sr_q);                /* SR_M */
>  	    TCGv val = tcg_temp_new();
>              tcg_gen_xor_i32(cpu_sr_t, REG(B7_4), REG(B11_8));
>              tcg_gen_shri_i32(cpu_sr_t, cpu_sr_t, 31);           /* SR_T */

Error setting M.  Q and M are set from different source registers.

And as a point of optimization, T no longer needs the shift if one uses the
extracted Q and M as inputs.

> +            /* add or subtract arg0 from arg1 depending if Q == M */
> +            tcg_gen_xor_i32(t1, cpu_sr_q, cpu_sr_m);
> +            tcg_gen_subi_i32(t1, t1, 1);
> +            tcg_gen_neg_i32(t2, REG(B7_4));
> +            tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, zero, REG(B7_4), t2);
> +            tcg_gen_add2_i32(REG(B11_8), t1, REG(B11_8), zero, t2, t1);

Why so complicated with the comparison?  I'd have expected

  tcg_gen_movcond_i32(TCG_COND_EQ, t2, cpu_sr_q, cpu_sr_m, REG(B7_4), t2);

Hmm... except I see you're re-using the condition as the high-part of the add2.
 That's pretty tricky.  Perhaps expand upon the comment?


r~
diff mbox

Patch

diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index b7dd7ab..82221c8 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -140,6 +140,8 @@  typedef struct CPUSH4State {
     uint32_t gregs[24];		/* general registers */
     float32 fregs[32];		/* floating point registers */
     uint32_t sr;                /* status register (with T split out) */
+    uint32_t sr_m;              /* M bit of status register */
+    uint32_t sr_q;              /* Q bit of status register */
     uint32_t sr_t;              /* T bit of status register */
     uint32_t ssr;		/* saved status register */
     uint32_t spc;		/* saved program counter */
@@ -342,13 +344,18 @@  static inline int cpu_ptel_pr (uint32_t ptel)
 
 static inline target_ulong cpu_read_sr(CPUSH4State *env)
 {
-    return (env->sr & ~(1u << SR_T)) | (env->sr_t << SR_T);
+    return (env->sr & ~((1u << SR_M) | (1u << SR_Q) | (1u << SR_T))) |
+           (env->sr_m << SR_M) |
+           (env->sr_q << SR_Q) |
+           (env->sr_t << SR_T);
 }
 
 static inline void cpu_write_sr(CPUSH4State *env, target_ulong sr)
 {
-    env->sr_t = sr & (1u << SR_T);
-    env->sr = sr & ~(1u << SR_T);
+    env->sr_m = (sr >> SR_M) & 1;
+    env->sr_q = (sr >> SR_Q) & 1;
+    env->sr_t = (sr >> SR_T) & 1;
+    env->sr = sr & ~((1u << SR_M) | (1u << SR_Q) | (1u << SR_T));
 }
 
 static inline void cpu_get_tb_cpu_state(CPUSH4State *env, target_ulong *pc,
diff --git a/target-sh4/helper.h b/target-sh4/helper.h
index 7162448..fbbe264 100644
--- a/target-sh4/helper.h
+++ b/target-sh4/helper.h
@@ -13,7 +13,6 @@  DEF_HELPER_3(movcal, void, env, i32, i32)
 DEF_HELPER_1(discard_movcal_backup, void, env)
 DEF_HELPER_2(ocbi, void, env, i32)
 
-DEF_HELPER_3(div1, i32, env, i32, i32)
 DEF_HELPER_3(macl, void, env, i32, i32)
 DEF_HELPER_3(macw, void, env, i32, i32)
 
diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
index 0e881a8..3ed0e2d 100644
--- a/target-sh4/op_helper.c
+++ b/target-sh4/op_helper.c
@@ -166,124 +166,6 @@  void helper_ocbi(CPUSH4State *env, uint32_t address)
     }
 }
 
-#define T (env->sr_t)
-#define Q (env->sr & (1u << SR_Q) ? 1 : 0)
-#define M (env->sr & (1u << SR_M) ? 1 : 0)
-#define SETT (env->sr_t = 1)
-#define CLRT (env->sr_t = 0)
-#define SETQ (env->sr |= (1u << SR_Q))
-#define CLRQ (env->sr &= ~(1u << SR_Q))
-#define SETM (env->sr |= (1u << SR_M))
-#define CLRM (env->sr &= ~(1u << SR_M))
-
-uint32_t helper_div1(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
-{
-    uint32_t tmp0, tmp2;
-    uint8_t old_q, tmp1 = 0xff;
-
-    //printf("div1 arg0=0x%08x arg1=0x%08x M=%d Q=%d T=%d\n", arg0, arg1, M, Q, T);
-    old_q = Q;
-    if ((0x80000000 & arg1) != 0)
-	SETQ;
-    else
-	CLRQ;
-    tmp2 = arg0;
-    arg1 <<= 1;
-    arg1 |= T;
-    switch (old_q) {
-    case 0:
-	switch (M) {
-	case 0:
-	    tmp0 = arg1;
-	    arg1 -= tmp2;
-	    tmp1 = arg1 > tmp0;
-	    switch (Q) {
-	    case 0:
-		if (tmp1)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    case 1:
-		if (tmp1 == 0)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    }
-	    break;
-	case 1:
-	    tmp0 = arg1;
-	    arg1 += tmp2;
-	    tmp1 = arg1 < tmp0;
-	    switch (Q) {
-	    case 0:
-		if (tmp1 == 0)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    case 1:
-		if (tmp1)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    }
-	    break;
-	}
-	break;
-    case 1:
-	switch (M) {
-	case 0:
-	    tmp0 = arg1;
-	    arg1 += tmp2;
-	    tmp1 = arg1 < tmp0;
-	    switch (Q) {
-	    case 0:
-		if (tmp1)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    case 1:
-		if (tmp1 == 0)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    }
-	    break;
-	case 1:
-	    tmp0 = arg1;
-	    arg1 -= tmp2;
-	    tmp1 = arg1 > tmp0;
-	    switch (Q) {
-	    case 0:
-		if (tmp1 == 0)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    case 1:
-		if (tmp1)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    }
-	    break;
-	}
-	break;
-    }
-    if (Q == M)
-	SETT;
-    else
-	CLRT;
-    //printf("Output: arg1=0x%08x M=%d Q=%d T=%d\n", arg1, M, Q, T);
-    return arg1;
-}
-
 void helper_macl(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
 {
     int64_t res;
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 4ef0398..d4046f8 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -59,7 +59,8 @@  enum {
 /* global register indexes */
 static TCGv_ptr cpu_env;
 static TCGv cpu_gregs[24];
-static TCGv cpu_pc, cpu_sr, cpu_sr_t, cpu_ssr, cpu_spc, cpu_gbr;
+static TCGv cpu_sr, cpu_sr_m, cpu_sr_q, cpu_sr_t;
+static TCGv cpu_pc, cpu_ssr, cpu_spc, cpu_gbr;
 static TCGv cpu_vbr, cpu_sgr, cpu_dbr, cpu_mach, cpu_macl;
 static TCGv cpu_pr, cpu_fpscr, cpu_fpul, cpu_ldst;
 static TCGv cpu_fregs[32];
@@ -107,6 +108,10 @@  void sh4_translate_init(void)
                                     offsetof(CPUSH4State, pc), "PC");
     cpu_sr = tcg_global_mem_new_i32(TCG_AREG0,
                                     offsetof(CPUSH4State, sr), "SR");
+    cpu_sr_m = tcg_global_mem_new_i32(TCG_AREG0,
+                                    offsetof(CPUSH4State, sr_m), "SR_M");
+    cpu_sr_q = tcg_global_mem_new_i32(TCG_AREG0,
+                                    offsetof(CPUSH4State, sr_q), "SR_Q");
     cpu_sr_t = tcg_global_mem_new_i32(TCG_AREG0,
                                     offsetof(CPUSH4State, sr_t), "SR_T");
     cpu_ssr = tcg_global_mem_new_i32(TCG_AREG0,
@@ -175,14 +180,28 @@  void superh_cpu_dump_state(CPUState *cs, FILE *f,
 }
 static void gen_read_sr(TCGv dst)
 {
-    tcg_gen_andi_i32(dst, cpu_sr, ~(1u << SR_T));
-    tcg_gen_or_i32(dst, dst, cpu_sr_t);
+    TCGv t0 = tcg_temp_new();
+    tcg_gen_andi_i32(dst, cpu_sr,
+                     ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
+    tcg_gen_shli_i32(t0, cpu_sr_q, SR_Q);
+    tcg_gen_or_i32(dst, dst, t0);
+    tcg_gen_shli_i32(t0, cpu_sr_m, SR_M);
+    tcg_gen_or_i32(dst, dst, t0);
+    tcg_gen_shli_i32(t0, cpu_sr_t, SR_T);
+    tcg_gen_or_i32(dst, dst, t0);
+    tcg_temp_free_i32(t0);
 }
 
 static void gen_write_sr(TCGv src)
 {
-    tcg_gen_andi_i32(cpu_sr, src, ~(1u << SR_T));
-    tcg_gen_andi_i32(cpu_sr_t, src, (1u << SR_T));
+    tcg_gen_andi_i32(cpu_sr, src,
+                     ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
+    tcg_gen_shri_i32(cpu_sr_q, src, SR_Q);
+    tcg_gen_andi_i32(cpu_sr_q, cpu_sr_q, 1);
+    tcg_gen_shri_i32(cpu_sr_m, src, SR_M);
+    tcg_gen_andi_i32(cpu_sr_m, cpu_sr_m, 1);
+    tcg_gen_shri_i32(cpu_sr_t, src, SR_T);
+    tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
 }
 
 static void gen_goto_tb(DisasContext * ctx, int n, target_ulong dest)
@@ -389,7 +408,8 @@  static void _decode_opc(DisasContext * ctx)
 
     switch (ctx->opcode) {
     case 0x0019:		/* div0u */
-        tcg_gen_andi_i32(cpu_sr, cpu_sr, ~((1u << SR_M) | (1u << SR_Q)));
+        tcg_gen_movi_i32(cpu_sr_m, 0);
+        tcg_gen_movi_i32(cpu_sr_q, 0);
         tcg_gen_movi_i32(cpu_sr_t, 0);
 	return;
     case 0x000b:		/* rts */
@@ -708,8 +728,8 @@  static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x2007:		/* div0s Rm,Rn */
 	{
-            gen_copy_bit_i32(cpu_sr, SR_Q, REG(B11_8), 31);     /* SR_Q */
-            gen_copy_bit_i32(cpu_sr, SR_M, REG(B7_4), 31);      /* SR_M */
+            tcg_gen_shri_i32(cpu_sr_q, REG(B11_8), 31);         /* SR_Q */
+            tcg_gen_mov_i32(cpu_sr_m, cpu_sr_q);                /* SR_M */
 	    TCGv val = tcg_temp_new();
             tcg_gen_xor_i32(cpu_sr_t, REG(B7_4), REG(B11_8));
             tcg_gen_shri_i32(cpu_sr_t, cpu_sr_t, 31);           /* SR_T */
@@ -717,7 +737,36 @@  static void _decode_opc(DisasContext * ctx)
 	}
 	return;
     case 0x3004:		/* div1 Rm,Rn */
-        gen_helper_div1(REG(B11_8), cpu_env, REG(B7_4), REG(B11_8));
+        {
+            TCGv t0 = tcg_temp_new();
+            TCGv t1 = tcg_temp_new();
+            TCGv t2 = tcg_temp_new();
+            TCGv zero = tcg_const_i32(0);
+
+            /* shift left arg1, saving the bit being pushed out and inserting
+               T on the right */
+            tcg_gen_shri_i32(t0, REG(B11_8), 31);
+            tcg_gen_shli_i32(REG(B11_8), REG(B11_8), 1);
+            tcg_gen_or_i32(REG(B11_8), REG(B11_8), cpu_sr_t);
+
+            /* add or subtract arg0 from arg1 depending if Q == M */
+            tcg_gen_xor_i32(t1, cpu_sr_q, cpu_sr_m);
+            tcg_gen_subi_i32(t1, t1, 1);
+            tcg_gen_neg_i32(t2, REG(B7_4));
+            tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, zero, REG(B7_4), t2);
+            tcg_gen_add2_i32(REG(B11_8), t1, REG(B11_8), zero, t2, t1);
+
+            /* compute T and Q depending on carry */
+            tcg_gen_andi_i32(t1, t1, 1);
+            tcg_gen_xor_i32(t1, t1, t0);
+            tcg_gen_xori_i32(cpu_sr_t, t1, 1);
+            tcg_gen_xor_i32(cpu_sr_q, cpu_sr_m, t1);
+
+            tcg_temp_free(zero);
+            tcg_temp_free(t2);
+            tcg_temp_free(t1);
+            tcg_temp_free(t0);
+        }
 	return;
     case 0x300d:		/* dmuls.l Rm,Rn */
         tcg_gen_muls2_i32(cpu_macl, cpu_mach, REG(B7_4), REG(B11_8));