diff mbox

[RFC,12/28] target-xtensa: implement shifts (ST1 and RST1 groups)

Message ID 1304470768-16924-12-git-send-email-jcmvbkbc@gmail.com
State New
Headers show

Commit Message

Max Filippov May 4, 2011, 12:59 a.m. UTC
- ST1: SAR (shift amount special register) manipulation, NSA(U);
- RST1: shifts, 16-bit multiplication.

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 target-xtensa/cpu.h       |    4 +
 target-xtensa/translate.c |  210 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 214 insertions(+), 0 deletions(-)

Comments

Richard Henderson May 4, 2011, 4:16 p.m. UTC | #1
On 05/03/2011 05:59 PM, Max Filippov wrote:
> +                    HAS_OPTION(XTENSA_OPTION_MISC_OP);
> +                    {
> +#define gen_bit_bisect(w) do { \
> +        int label = gen_new_label(); \
> +        tcg_gen_brcondi_i32(TCG_COND_LTU, tmp, 1 << (w), label); \
> +        tcg_gen_shri_i32(tmp, tmp, (w)); \
> +        tcg_gen_subi_i32(res, res, (w)); \
> +        gen_set_label(label); \
> +    } while (0)
> +
> +                        int label = gen_new_label();
> +                        TCGv_i32 res = tcg_temp_local_new_i32();
> +
> +                        tcg_gen_movi_i32(res, 32);
> +                        tcg_gen_brcondi_i32(
> +                                TCG_COND_EQ, cpu_R[RRR_S], 0, label);
> +                        {
> +                            TCGv_i32 tmp = tcg_temp_local_new_i32();
> +                            tcg_gen_mov_i32(tmp, cpu_R[RRR_S]);
> +                            tcg_gen_movi_i32(res, 31);
> +
> +                            gen_bit_bisect(16);
> +                            gen_bit_bisect(8);
> +                            gen_bit_bisect(4);
> +                            gen_bit_bisect(2);
> +                            gen_bit_bisect(1);
> +
> +                            tcg_temp_free(tmp);
> +                        }
> +                        gen_set_label(label);
> +                        tcg_gen_mov_i32(cpu_R[RRR_T], res);
> +                        tcg_temp_free(res);
> +#undef gen_bit_bisect

This instruction is probably right at the edge of the size restrictions
on the number of ops allowed to be emitted per guest insn.  It probably
makes more sense to move this to an out-of-line helper function.

Also note that this is implementable more efficiently on hosts that have
a count-leading-zeros function:

uint32_t HELPER(nsau)(uint32_t val)
{
    return val ? clz32(val) : 32;
}

uint32_t HELPER(nsa)(int32_t val)
{
    if (val < 0) {
        val = ~val;
    }
    if (val == 0) {
        return 31;
    }
    return clz32(val) - 1;
}

> +            case 9: /*SRL*/
> +                {
> +                    TCGv_i64 v = tcg_temp_new_i64();
> +                    tcg_gen_extu_i32_i64(v, cpu_R[RRR_T]);
> +                    gen_shift(shr);
> +                }
> +                break;
> +
> +            case 10: /*SLL*/
> +                {
> +                    TCGv_i64 v = tcg_temp_new_i64();
> +                    TCGv_i32 s = tcg_const_i32(32);
> +                    tcg_gen_sub_i32(s, s, cpu_SR[SAR]);
> +                    tcg_gen_extu_i32_i64(v, cpu_R[RRR_S]);
> +                    gen_shift_reg(shl, s);
> +                    tcg_temp_free(s);
> +                }
> +                break;
> +
> +            case 11: /*SRA*/
> +                {
> +                    TCGv_i64 v = tcg_temp_new_i64();
> +                    tcg_gen_ext_i32_i64(v, cpu_R[RRR_T]);
> +                    gen_shift(sar);
> +                }

Are you implementing some of these as 64-bit shifts simply
to get a shift count of 32 correct?  While I admit that it's
probably the most efficient mechanism when the host is 64-bit,
it's somewhat less than clear.  You could stand to add some
commentary here about your choice.

As a future enhancement, it might be worthwhile to track any
known contents of SAR within the TB (see how other ports put
information about the state of the flags register in the 
DisasContext).  If you have a known value in the SAR, you
can emit the proper 32-bit shift directly.


r~
Max Filippov May 4, 2011, 4:39 p.m. UTC | #2
>> +                    HAS_OPTION(XTENSA_OPTION_MISC_OP);
>> +                    {
>> +#define gen_bit_bisect(w) do { \
>> +        int label = gen_new_label(); \
>> +        tcg_gen_brcondi_i32(TCG_COND_LTU, tmp, 1 << (w), label); \
>> +        tcg_gen_shri_i32(tmp, tmp, (w)); \
>> +        tcg_gen_subi_i32(res, res, (w)); \
>> +        gen_set_label(label); \
>> +    } while (0)
>> +
>> +                        int label = gen_new_label();
>> +                        TCGv_i32 res = tcg_temp_local_new_i32();
>> +
>> +                        tcg_gen_movi_i32(res, 32);
>> +                        tcg_gen_brcondi_i32(
>> +                                TCG_COND_EQ, cpu_R[RRR_S], 0, label);
>> +                        {
>> +                            TCGv_i32 tmp = tcg_temp_local_new_i32();
>> +                            tcg_gen_mov_i32(tmp, cpu_R[RRR_S]);
>> +                            tcg_gen_movi_i32(res, 31);
>> +
>> +                            gen_bit_bisect(16);
>> +                            gen_bit_bisect(8);
>> +                            gen_bit_bisect(4);
>> +                            gen_bit_bisect(2);
>> +                            gen_bit_bisect(1);
>> +
>> +                            tcg_temp_free(tmp);
>> +                        }
>> +                        gen_set_label(label);
>> +                        tcg_gen_mov_i32(cpu_R[RRR_T], res);
>> +                        tcg_temp_free(res);
>> +#undef gen_bit_bisect
>
> This instruction is probably right at the edge of the size restrictions
> on the number of ops allowed to be emitted per guest insn.  It probably
> makes more sense to move this to an out-of-line helper function.
>
> Also note that this is implementable more efficiently on hosts that have
> a count-leading-zeros function:
>
> uint32_t HELPER(nsau)(uint32_t val)
> {
>    return val ? clz32(val) : 32;
> }
>
> uint32_t HELPER(nsa)(int32_t val)
> {
>    if (val < 0) {
>        val = ~val;
>    }
>    if (val == 0) {
>        return 31;
>    }
>    return clz32(val) - 1;
> }

Thanks for the hint, this way it looks much better.

>> +            case 9: /*SRL*/
>> +                {
>> +                    TCGv_i64 v = tcg_temp_new_i64();
>> +                    tcg_gen_extu_i32_i64(v, cpu_R[RRR_T]);
>> +                    gen_shift(shr);
>> +                }
>> +                break;
>> +
>> +            case 10: /*SLL*/
>> +                {
>> +                    TCGv_i64 v = tcg_temp_new_i64();
>> +                    TCGv_i32 s = tcg_const_i32(32);
>> +                    tcg_gen_sub_i32(s, s, cpu_SR[SAR]);
>> +                    tcg_gen_extu_i32_i64(v, cpu_R[RRR_S]);
>> +                    gen_shift_reg(shl, s);
>> +                    tcg_temp_free(s);
>> +                }
>> +                break;
>> +
>> +            case 11: /*SRA*/
>> +                {
>> +                    TCGv_i64 v = tcg_temp_new_i64();
>> +                    tcg_gen_ext_i32_i64(v, cpu_R[RRR_T]);
>> +                    gen_shift(sar);
>> +                }
>
> Are you implementing some of these as 64-bit shifts simply
> to get a shift count of 32 correct?  While I admit that it's
> probably the most efficient mechanism when the host is 64-bit,
> it's somewhat less than clear.  You could stand to add some
> commentary here about your choice.

Yes, possibility of indirect 32 bit shift was the only reason for that.
Will document that.

> As a future enhancement, it might be worthwhile to track any
> known contents of SAR within the TB (see how other ports put
> information about the state of the flags register in the
> DisasContext).  If you have a known value in the SAR, you
> can emit the proper 32-bit shift directly.

To track immediate values written to SAR? You mean that there may be
some performance difference of fixed size shift vs indirect shift and
TCG is able to tell them apart?

Thanks.
-- Max
Richard Henderson May 4, 2011, 7:07 p.m. UTC | #3
On 05/04/2011 09:39 AM, Max Filippov wrote:
> To track immediate values written to SAR? You mean that there may be
> some performance difference of fixed size shift vs indirect shift and
> TCG is able to tell them apart?

Well, not really fixed vs indirect, but if you know that the value
in the SAR register is in the right range, you can avoid using a
64-bit shift.

For instance,

	SSL	ar2
	SLL	ar0, ar1

could be implemented with

	tcg_gen_sll_i32(ar0, ar1, ar2);

assuming we have enough context.

Let us decompose the SAR register into two parts, storing both the
true value, and 32-value.

    struct DisasContext {
        // Current Stuff
	// ...

	// When valid, holds 32-SAR.
        TCGv sar_m32;
	bool sar_m32_alloc;
	bool sar_m32_valid;
	bool sar_5bit;
    };

At the beginning of the TB:

	TCGV_UNUSED_I32(dc->sar_m32);
	dc->sar_m32_alloc = false;
	dc->sar_m32_valid = false;
	dc->sar_5bit = false;



static void gen_set_sra_m32(DisasContext *dc, TCGv val)
{
    if (!dc->sar_m32_alloc) {
        dc->sar_m32_alloc = true;
        dc->sar_m32 = tcg_temp_local_new_i32();
    }
    dc->sar_m32_valid = true;

    /* Clear 5 bit because the SAR value could be 32.  */
    dc->sar_5bit = false;

    tcg_gen_movi_i32(cpu_SR[SAR], 32);
    tcg_gen_sub_i32(cpu_SR[SAR], cpu_SR[SAR], val);
    tcg_gen_mov_i32(dc->sar_m32, val);
}

static void gen_set_sra(DisasContext *dc, TCGv val, bool is_5bit)
{
    if (dc->sar_m32_alloc && dc->sar_m32_valid) {
	tcg_gen_discard_i32(dc->sar_m32);
    }
    dc->sar_m32_valid = false;
    dc->sar_5bit = is_5bit;

    tcg_gen_mov_i32(cpu_SR[SAR], val);
}

	/* SSL */
	tcg_gen_andi_i32(tmp, cpu_R[AS], 31);
	gen_set_sra_m32(dc, tmp);
	break;

	/* SRL */
	tcg_gen_andi_i32(tmp, cpu_R[AS], 31);
	gen_set_sra(dc, tmp, true);
	break;

	/* WSR.SAR */
	tcg_gen_andi_i32(tmp, cpu_R[AS], 63);
	gen_set_sra(dc, tmp, false);
	break;

	/* SSAI */
	tcg_gen_movi_i32(tmp, constant);
	gen_set_sra(dc, tmp, true);
	break;

	/* SLL */
	if (dc->sar_m32_valid) {
	    tcg_gen_sll_i32(cpu_R[AR], cpu_R[AS], dc->sar_m32);
        } else {
	    /* your existing 64-bit shift emulation.  */
	}
	break;

	/* SRL */
	if (dc->sar_5bit) {
	    tcg_gen_srl_i32(cpu_R[AR], cpu_R[AS], cpu_SR[SAR]);
	} else {
	    /* your existing 64-bit shift emulation.  */
	}


A couple of points: The use of the local temp avoids problems with
intervening insns that might generate branch opcodes.  For the
simplest cases, as with the case at the start of the message, we
ought to be able to propagate the values into the TCG shift insn
directly.

Does that make sense?


r~
Max Filippov May 5, 2011, 8:40 a.m. UTC | #4
>> To track immediate values written to SAR? You mean that there may be
>> some performance difference of fixed size shift vs indirect shift and
>> TCG is able to tell them apart?
>
> Well, not really fixed vs indirect, but if you know that the value
> in the SAR register is in the right range, you can avoid using a
> 64-bit shift.
>
> For instance,
>
>        SSL     ar2
>        SLL     ar0, ar1
>
> could be implemented with
>
>        tcg_gen_sll_i32(ar0, ar1, ar2);
>
> assuming we have enough context.
>
> Let us decompose the SAR register into two parts, storing both the
> true value, and 32-value.
>
>    struct DisasContext {
>        // Current Stuff
>        // ...
>
>        // When valid, holds 32-SAR.
>        TCGv sar_m32;
>        bool sar_m32_alloc;
>        bool sar_m32_valid;
>        bool sar_5bit;
>    };
>
> At the beginning of the TB:
>
>        TCGV_UNUSED_I32(dc->sar_m32);
>        dc->sar_m32_alloc = false;
>        dc->sar_m32_valid = false;
>        dc->sar_5bit = false;
>
>
>
> static void gen_set_sra_m32(DisasContext *dc, TCGv val)
> {
>    if (!dc->sar_m32_alloc) {
>        dc->sar_m32_alloc = true;
>        dc->sar_m32 = tcg_temp_local_new_i32();
>    }
>    dc->sar_m32_valid = true;
>
>    /* Clear 5 bit because the SAR value could be 32.  */
>    dc->sar_5bit = false;
>
>    tcg_gen_movi_i32(cpu_SR[SAR], 32);
>    tcg_gen_sub_i32(cpu_SR[SAR], cpu_SR[SAR], val);
>    tcg_gen_mov_i32(dc->sar_m32, val);
> }
>
> static void gen_set_sra(DisasContext *dc, TCGv val, bool is_5bit)
> {
>    if (dc->sar_m32_alloc && dc->sar_m32_valid) {
>        tcg_gen_discard_i32(dc->sar_m32);
>    }
>    dc->sar_m32_valid = false;
>    dc->sar_5bit = is_5bit;
>
>    tcg_gen_mov_i32(cpu_SR[SAR], val);
> }
>
>        /* SSL */
>        tcg_gen_andi_i32(tmp, cpu_R[AS], 31);
>        gen_set_sra_m32(dc, tmp);
>        break;
>
>        /* SRL */
>        tcg_gen_andi_i32(tmp, cpu_R[AS], 31);
>        gen_set_sra(dc, tmp, true);
>        break;
>
>        /* WSR.SAR */
>        tcg_gen_andi_i32(tmp, cpu_R[AS], 63);
>        gen_set_sra(dc, tmp, false);
>        break;
>
>        /* SSAI */
>        tcg_gen_movi_i32(tmp, constant);
>        gen_set_sra(dc, tmp, true);
>        break;
>
>        /* SLL */
>        if (dc->sar_m32_valid) {
>            tcg_gen_sll_i32(cpu_R[AR], cpu_R[AS], dc->sar_m32);
>        } else {
>            /* your existing 64-bit shift emulation.  */
>        }
>        break;
>
>        /* SRL */
>        if (dc->sar_5bit) {
>            tcg_gen_srl_i32(cpu_R[AR], cpu_R[AS], cpu_SR[SAR]);
>        } else {
>            /* your existing 64-bit shift emulation.  */
>        }
>
>
> A couple of points: The use of the local temp avoids problems with
> intervening insns that might generate branch opcodes.  For the
> simplest cases, as with the case at the start of the message, we
> ought to be able to propagate the values into the TCG shift insn
> directly.
>
> Does that make sense?

Yes it does. Thanks for the good explanation.
I tried to keep it all as simple as possible to have a working
prototype quickly. Now that it works optimizations should be no
problem.

Thanks.
-- Max
diff mbox

Patch

diff --git a/target-xtensa/cpu.h b/target-xtensa/cpu.h
index e99e3bb..a13a6cb 100644
--- a/target-xtensa/cpu.h
+++ b/target-xtensa/cpu.h
@@ -105,6 +105,10 @@  enum {
     FSR = 233,
 };
 
+enum {
+    SAR = 3,
+};
+
 typedef struct XtensaConfig {
     const char *name;
     uint64_t options;
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index 031873e..a940417 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -57,6 +57,7 @@  static TCGv_i32 cpu_UR[256];
 #include "gen-icount.h"
 
 static const char * const sregnames[256] = {
+    [SAR] = "SAR",
 };
 
 static const char * const uregnames[256] = {
@@ -295,6 +296,101 @@  static void disas_xtensa_insn(DisasContext *dc)
                 break;
 
             case 4: /*ST1*/
+                switch (RRR_R) {
+                case 0: /*SSR*/
+                    tcg_gen_andi_i32(cpu_SR[SAR], cpu_R[RRR_S], 0x1f);
+                    break;
+
+                case 1: /*SSL*/
+                    {
+                        TCGv_i32 base = tcg_const_i32(32);
+                        TCGv_i32 tmp = tcg_temp_new_i32();
+                        tcg_gen_andi_i32(tmp, cpu_R[RRR_S], 0x1f);
+                        tcg_gen_sub_i32(cpu_SR[SAR], base, tmp);
+                        tcg_temp_free(tmp);
+                        tcg_temp_free(base);
+                    }
+                    break;
+
+                case 2: /*SSA8L*/
+                    {
+                        TCGv_i32 tmp = tcg_temp_new_i32();
+                        tcg_gen_andi_i32(tmp, cpu_R[RRR_S], 0x3);
+                        tcg_gen_shli_i32(cpu_SR[SAR], tmp, 3);
+                        tcg_temp_free(tmp);
+                    }
+                    break;
+
+                case 3: /*SSA8B*/
+                    {
+                        TCGv_i32 base = tcg_const_i32(32);
+                        TCGv_i32 tmp = tcg_temp_new_i32();
+                        tcg_gen_andi_i32(tmp, cpu_R[RRR_S], 0x3);
+                        tcg_gen_shli_i32(tmp, tmp, 3);
+                        tcg_gen_sub_i32(cpu_SR[SAR], base, tmp);
+                        tcg_temp_free(tmp);
+                        tcg_temp_free(base);
+                    }
+                    break;
+
+                case 4: /*SSAI*/
+                    tcg_gen_movi_i32(cpu_SR[SAR], RRR_S | ((RRR_T & 1) << 4));
+                    break;
+
+                case 6: /*RER*/
+                    break;
+
+                case 7: /*WER*/
+                    break;
+
+                case 8: /*ROTWw*/
+                    HAS_OPTION(XTENSA_OPTION_WINDOWED_REGISTER);
+                    break;
+
+                case 14: /*NSAu*/
+                    HAS_OPTION(XTENSA_OPTION_MISC_OP);
+                    break;
+
+                case 15: /*NSAUu*/
+                    HAS_OPTION(XTENSA_OPTION_MISC_OP);
+                    {
+#define gen_bit_bisect(w) do { \
+        int label = gen_new_label(); \
+        tcg_gen_brcondi_i32(TCG_COND_LTU, tmp, 1 << (w), label); \
+        tcg_gen_shri_i32(tmp, tmp, (w)); \
+        tcg_gen_subi_i32(res, res, (w)); \
+        gen_set_label(label); \
+    } while (0)
+
+                        int label = gen_new_label();
+                        TCGv_i32 res = tcg_temp_local_new_i32();
+
+                        tcg_gen_movi_i32(res, 32);
+                        tcg_gen_brcondi_i32(
+                                TCG_COND_EQ, cpu_R[RRR_S], 0, label);
+                        {
+                            TCGv_i32 tmp = tcg_temp_local_new_i32();
+                            tcg_gen_mov_i32(tmp, cpu_R[RRR_S]);
+                            tcg_gen_movi_i32(res, 31);
+
+                            gen_bit_bisect(16);
+                            gen_bit_bisect(8);
+                            gen_bit_bisect(4);
+                            gen_bit_bisect(2);
+                            gen_bit_bisect(1);
+
+                            tcg_temp_free(tmp);
+                        }
+                        gen_set_label(label);
+                        tcg_gen_mov_i32(cpu_R[RRR_T], res);
+                        tcg_temp_free(res);
+#undef gen_bit_bisect
+                    }
+                    break;
+
+                default: /*reserved*/
+                    break;
+                }
                 break;
 
             case 5: /*TLB*/
@@ -358,6 +454,111 @@  static void disas_xtensa_insn(DisasContext *dc)
             break;
 
         case 1: /*RST1*/
+            switch (_OP2) {
+            case 0: /*SLLI*/
+            case 1:
+                tcg_gen_shli_i32(cpu_R[RRR_R], cpu_R[RRR_S],
+                        32 - (RRR_T | ((_OP2 & 1) << 4)));
+                break;
+
+            case 2: /*SRAI*/
+            case 3:
+                tcg_gen_sari_i32(cpu_R[RRR_R], cpu_R[RRR_T],
+                        RRR_S | ((_OP2 & 1) << 4));
+                break;
+
+            case 4: /*SRLI*/
+                tcg_gen_shri_i32(cpu_R[RRR_R], cpu_R[RRR_T], RRR_S);
+                break;
+
+            case 6: /*XSR*/
+                {
+                    TCGv_i32 tmp = tcg_temp_new_i32();
+                    tcg_gen_mov_i32(tmp, cpu_R[RRR_T]);
+                    gen_rsr(cpu_R[RRR_T], RSR_SR);
+                    gen_wsr(dc, RSR_SR, tmp);
+                    tcg_temp_free(tmp);
+                }
+                break;
+
+#define gen_shift_reg(cmd, reg) do { \
+                    TCGv_i64 tmp = tcg_temp_new_i64(); \
+                    tcg_gen_extu_i32_i64(tmp, reg); \
+                    tcg_gen_andi_i64(tmp, tmp, 63); \
+                    tcg_gen_##cmd##_i64(v, v, tmp); \
+                    tcg_gen_trunc_i64_i32(cpu_R[RRR_R], v); \
+                    tcg_temp_free_i64(v); \
+                    tcg_temp_free_i64(tmp); \
+                } while (0)
+
+#define gen_shift(cmd) gen_shift_reg(cmd, cpu_SR[SAR])
+
+            case 8: /*SRC*/
+                {
+                    TCGv_i64 v = tcg_temp_new_i64();
+                    tcg_gen_concat_i32_i64(v, cpu_R[RRR_T], cpu_R[RRR_S]);
+                    gen_shift(shr);
+                }
+                break;
+
+            case 9: /*SRL*/
+                {
+                    TCGv_i64 v = tcg_temp_new_i64();
+                    tcg_gen_extu_i32_i64(v, cpu_R[RRR_T]);
+                    gen_shift(shr);
+                }
+                break;
+
+            case 10: /*SLL*/
+                {
+                    TCGv_i64 v = tcg_temp_new_i64();
+                    TCGv_i32 s = tcg_const_i32(32);
+                    tcg_gen_sub_i32(s, s, cpu_SR[SAR]);
+                    tcg_gen_extu_i32_i64(v, cpu_R[RRR_S]);
+                    gen_shift_reg(shl, s);
+                    tcg_temp_free(s);
+                }
+                break;
+
+            case 11: /*SRA*/
+                {
+                    TCGv_i64 v = tcg_temp_new_i64();
+                    tcg_gen_ext_i32_i64(v, cpu_R[RRR_T]);
+                    gen_shift(sar);
+                }
+                break;
+#undef gen_shift
+#undef gen_shift_reg
+
+            case 12: /*MUL16U*/
+                HAS_OPTION(XTENSA_OPTION_16_BIT_IMUL);
+                {
+                    TCGv_i32 v1 = tcg_temp_new_i32();
+                    TCGv_i32 v2 = tcg_temp_new_i32();
+                    tcg_gen_ext16u_i32(v1, cpu_R[RRR_S]);
+                    tcg_gen_ext16u_i32(v2, cpu_R[RRR_T]);
+                    tcg_gen_mul_i32(cpu_R[RRR_R], v1, v2);
+                    tcg_temp_free(v2);
+                    tcg_temp_free(v1);
+                }
+                break;
+
+            case 13: /*MUL16S*/
+                HAS_OPTION(XTENSA_OPTION_16_BIT_IMUL);
+                {
+                    TCGv_i32 v1 = tcg_temp_new_i32();
+                    TCGv_i32 v2 = tcg_temp_new_i32();
+                    tcg_gen_ext16s_i32(v1, cpu_R[RRR_S]);
+                    tcg_gen_ext16s_i32(v2, cpu_R[RRR_T]);
+                    tcg_gen_mul_i32(cpu_R[RRR_R], v1, v2);
+                    tcg_temp_free(v2);
+                    tcg_temp_free(v1);
+                }
+                break;
+
+            default: /*reserved*/
+                break;
+            }
             break;
 
         case 2: /*RST2*/
@@ -487,6 +688,15 @@  static void disas_xtensa_insn(DisasContext *dc)
 
         case 4: /*EXTUI*/
         case 5:
+            {
+                int shiftimm = RRR_S | (_OP1 << 4);
+                int maskimm = (1 << (_OP2 + 1)) - 1;
+
+                TCGv_i32 tmp = tcg_temp_new_i32();
+                tcg_gen_shri_i32(tmp, cpu_R[RRR_T], shiftimm);
+                tcg_gen_andi_i32(cpu_R[RRR_R], tmp, maskimm);
+                tcg_temp_free(tmp);
+            }
             break;
 
         case 6: /*CUST0*/