diff mbox

[RFC,v1,09/13] target-ppc: add cmpeqb instruction

Message ID 1468861517-2508-10-git-send-email-nikunj@linux.vnet.ibm.com
State New
Headers show

Commit Message

Nikunj A Dadhania July 18, 2016, 5:05 p.m. UTC
From: Swapnil Bokade <bokadeswapnil@gmail.com>

Search a byte in the stream of 8bytes provided in the register

Signed-off-by: Sandipan Das <sandipandas1990@gmail.com>
[ Modified the logic to use lesser temporaries ]
Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 target-ppc/translate.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

Comments

Nikunj A Dadhania July 18, 2016, 5:12 p.m. UTC | #1
Nikunj A Dadhania <nikunj@linux.vnet.ibm.com> writes:

> From: Swapnil Bokade <bokadeswapnil@gmail.com>
>
> Search a byte in the stream of 8bytes provided in the register
>
> Signed-off-by: Sandipan Das <sandipandas1990@gmail.com>

Should have been:

Signed-off-by: Swapnil Bokade <bokadeswapnil@gmail.com>

> [ Modified the logic to use lesser temporaries ]
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>

Regards
Nikunj
Richard Henderson July 21, 2016, 6:41 a.m. UTC | #2
On 07/18/2016 10:35 PM, Nikunj A Dadhania wrote:
> +    tcg_gen_andi_tl(src1, cpu_gpr[rA(ctx->opcode)], 0xFF);
> +    for (i = 0; i < 64; i += 8) {
> +        tcg_gen_shri_tl(t0, arg1, i);
> +        tcg_gen_andi_tl(t0, t0, 0xFF);
> +        tcg_gen_brcond_tl(TCG_COND_EQ, src1, t0, l1);
> +    }
> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 0);
> +    tcg_gen_br(l2);
> +    gen_set_label(l1);
> +    /* Set match bit, i.e. CRF_GT */
> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 1 << CRF_GT);

Ew.  This can be done much better as

   http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord

Which still might be best done in a helper, because of the constants involved 
(tcg is not nearly so good as gcc in building full 64-bit constants).

C.f. target-alpha/int_helper.c, helper_cmpbe0 (which computes different 
information than cmpeqb, but is still helpful as an example).


r~
Nikunj A Dadhania July 21, 2016, 8:02 a.m. UTC | #3
Richard Henderson <rth@twiddle.net> writes:

> On 07/18/2016 10:35 PM, Nikunj A Dadhania wrote:
>> +    tcg_gen_andi_tl(src1, cpu_gpr[rA(ctx->opcode)], 0xFF);
>> +    for (i = 0; i < 64; i += 8) {
>> +        tcg_gen_shri_tl(t0, arg1, i);
>> +        tcg_gen_andi_tl(t0, t0, 0xFF);
>> +        tcg_gen_brcond_tl(TCG_COND_EQ, src1, t0, l1);
>> +    }
>> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 0);
>> +    tcg_gen_br(l2);
>> +    gen_set_label(l1);
>> +    /* Set match bit, i.e. CRF_GT */
>> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 1 << CRF_GT);
>
> Ew.  This can be done much better as
>
>    http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
>
> Which still might be best done in a helper, because of the constants involved 
> (tcg is not nearly so good as gcc in building full 64-bit constants).
>
> C.f. target-alpha/int_helper.c, helper_cmpbe0 (which computes different 
> information than cmpeqb, but is still helpful as an example).

Sure, I will have a look.

Nikunj
David Gibson July 22, 2016, 4:57 a.m. UTC | #4
On Mon, Jul 18, 2016 at 10:35:13PM +0530, Nikunj A Dadhania wrote:
> From: Swapnil Bokade <bokadeswapnil@gmail.com>
> 
> Search a byte in the stream of 8bytes provided in the register
> 
> Signed-off-by: Sandipan Das <sandipandas1990@gmail.com>
> [ Modified the logic to use lesser temporaries ]
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>

rth's reference may obsolete the suggestions below.

> ---
>  target-ppc/translate.c | 27 +++++++++++++++++++++++++++
>  1 file changed, 27 insertions(+)
> 
> diff --git a/target-ppc/translate.c b/target-ppc/translate.c
> index a57f7dd..8f7ff49 100644
> --- a/target-ppc/translate.c
> +++ b/target-ppc/translate.c
> @@ -856,6 +856,32 @@ static void gen_cmprb(DisasContext *ctx)
>      tcg_temp_free(src2hi);
>  }
>  
> +/* cmpeqb */
> +static void gen_cmpeqb(DisasContext *ctx)
> +{
> +    TCGLabel *l1 = gen_new_label();
> +    TCGLabel *l2 = gen_new_label();
> +    TCGv src1 = tcg_temp_local_new();
> +    TCGv t0 = tcg_temp_local_new();
> +    TCGv arg1 = cpu_gpr[rB(ctx->opcode)];
> +    int i;
> +
> +    tcg_gen_andi_tl(src1, cpu_gpr[rA(ctx->opcode)], 0xFF);
> +    for (i = 0; i < 64; i += 8) {
> +        tcg_gen_shri_tl(t0, arg1, i);
> +        tcg_gen_andi_tl(t0, t0, 0xFF);

Shifting direct from the original arg each time seems awkward when you
can just shift a working reg by 8 bits each loop.  I suspect that
could save you a temporary.

> +        tcg_gen_brcond_tl(TCG_COND_EQ, src1, t0, l1);
> +    }
> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 0);
> +    tcg_gen_br(l2);
> +    gen_set_label(l1);
> +    /* Set match bit, i.e. CRF_GT */
> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 1 << CRF_GT);
> +    gen_set_label(l2);

You should only need one branch at most, either by initializing the
CRF first, or by moving a variable to it.

> +    tcg_temp_free(src1);
> +    tcg_temp_free(t0);
> +}
> +
>  /* isel (PowerPC 2.03 specification) */
>  static void gen_isel(DisasContext *ctx)
>  {
> @@ -10040,6 +10066,7 @@ GEN_HANDLER(cmp, 0x1F, 0x00, 0x00, 0x00400000, PPC_INTEGER),
>  GEN_HANDLER(cmpi, 0x0B, 0xFF, 0xFF, 0x00400000, PPC_INTEGER),
>  GEN_HANDLER(cmpl, 0x1F, 0x00, 0x01, 0x00400000, PPC_INTEGER),
>  GEN_HANDLER(cmpli, 0x0A, 0xFF, 0xFF, 0x00400000, PPC_INTEGER),
> +GEN_HANDLER_E(cmpeqb, 0x1F, 0x00, 0x07, 0x00600000, PPC_NONE, PPC2_ISA300),
>  GEN_HANDLER_E(cmpb, 0x1F, 0x1C, 0x0F, 0x00000001, PPC_NONE, PPC2_ISA205),
>  GEN_HANDLER_E(cmprb, 0x1F, 0x00, 0x06, 0x00400001, PPC_NONE, PPC2_ISA300),
>  GEN_HANDLER(isel, 0x1F, 0x0F, 0xFF, 0x00000001, PPC_ISEL),
Nikunj A Dadhania July 22, 2016, 7:28 p.m. UTC | #5
Richard Henderson <rth@twiddle.net> writes:

> On 07/18/2016 10:35 PM, Nikunj A Dadhania wrote:
>> +    tcg_gen_andi_tl(src1, cpu_gpr[rA(ctx->opcode)], 0xFF);
>> +    for (i = 0; i < 64; i += 8) {
>> +        tcg_gen_shri_tl(t0, arg1, i);
>> +        tcg_gen_andi_tl(t0, t0, 0xFF);
>> +        tcg_gen_brcond_tl(TCG_COND_EQ, src1, t0, l1);
>> +    }
>> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 0);
>> +    tcg_gen_br(l2);
>> +    gen_set_label(l1);
>> +    /* Set match bit, i.e. CRF_GT */
>> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 1 << CRF_GT);
>
> Ew.  This can be done much better as
>
>    http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
>
> Which still might be best done in a helper, because of the constants involved 
> (tcg is not nearly so good as gcc in building full 64-bit constants).
>
> C.f. target-alpha/int_helper.c, helper_cmpbe0 (which computes different 
> information than cmpeqb, but is still helpful as an example).

Thanks for the hints, that got reduce to following:

#define haszero(v) (((v) - 0x0101010101010101UL) & ~(v) & 0x8080808080808080UL)
#define hasvalue(x,n)  (haszero((x) ^ (~0UL/255 * (n))))

uint32_t helper_cmpeqb(target_ulong ra, target_ulong rb)
{
    return !!hasvalue(rb, ra);
}

Regards
Nikunj
Richard Henderson July 23, 2016, 1:17 a.m. UTC | #6
On 07/23/2016 12:58 AM, Nikunj A Dadhania wrote:
> Richard Henderson <rth@twiddle.net> writes:
>
>> On 07/18/2016 10:35 PM, Nikunj A Dadhania wrote:
>>> +    tcg_gen_andi_tl(src1, cpu_gpr[rA(ctx->opcode)], 0xFF);
>>> +    for (i = 0; i < 64; i += 8) {
>>> +        tcg_gen_shri_tl(t0, arg1, i);
>>> +        tcg_gen_andi_tl(t0, t0, 0xFF);
>>> +        tcg_gen_brcond_tl(TCG_COND_EQ, src1, t0, l1);
>>> +    }
>>> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 0);
>>> +    tcg_gen_br(l2);
>>> +    gen_set_label(l1);
>>> +    /* Set match bit, i.e. CRF_GT */
>>> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 1 << CRF_GT);
>>
>> Ew.  This can be done much better as
>>
>>    http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
>>
>> Which still might be best done in a helper, because of the constants involved
>> (tcg is not nearly so good as gcc in building full 64-bit constants).
>>
>> C.f. target-alpha/int_helper.c, helper_cmpbe0 (which computes different
>> information than cmpeqb, but is still helpful as an example).
>
> Thanks for the hints, that got reduce to following:
>
> #define haszero(v) (((v) - 0x0101010101010101UL) & ~(v) & 0x8080808080808080UL)
> #define hasvalue(x,n)  (haszero((x) ^ (~0UL/255 * (n))))
>
> uint32_t helper_cmpeqb(target_ulong ra, target_ulong rb)
> {
>     return !!hasvalue(rb, ra);
> }

A couple of things:

(1) I don't see N being masked to 0xff before replicating.
(2) You need to use ULL for 32-bit hosts, or casts, e.g.

#define dup(x)          (((x) & 0xff) * (~(target_ulong)0 / 0xff))
#define haszero(v)      (((v) - dup(0x01)) & ~(v) & dup(0x80))
#define hasvalue(x, n)  haszero((x) ^ dup(n))

(3) You probably ought to go ahead and add compute the proper crf value here:

   return hasvalue(rb, ra) ? 1 << CRF_GT : 0;

so that within the translator you just have

   gen_helper_cmpeqb(cpu_crf[...], cpu_gpr[...], cpu_gpr[...])



r~
Nikunj A Dadhania July 23, 2016, 6:08 a.m. UTC | #7
Richard Henderson <rth@twiddle.net> writes:

> On 07/23/2016 12:58 AM, Nikunj A Dadhania wrote:
>> Richard Henderson <rth@twiddle.net> writes:
>>
>>> On 07/18/2016 10:35 PM, Nikunj A Dadhania wrote:
>>>> +    tcg_gen_andi_tl(src1, cpu_gpr[rA(ctx->opcode)], 0xFF);
>>>> +    for (i = 0; i < 64; i += 8) {
>>>> +        tcg_gen_shri_tl(t0, arg1, i);
>>>> +        tcg_gen_andi_tl(t0, t0, 0xFF);
>>>> +        tcg_gen_brcond_tl(TCG_COND_EQ, src1, t0, l1);
>>>> +    }
>>>> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 0);
>>>> +    tcg_gen_br(l2);
>>>> +    gen_set_label(l1);
>>>> +    /* Set match bit, i.e. CRF_GT */
>>>> +    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 1 << CRF_GT);
>>>
>>> Ew.  This can be done much better as
>>>
>>>    http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
>>>
>>> Which still might be best done in a helper, because of the constants involved
>>> (tcg is not nearly so good as gcc in building full 64-bit constants).
>>>
>>> C.f. target-alpha/int_helper.c, helper_cmpbe0 (which computes different
>>> information than cmpeqb, but is still helpful as an example).
>>
>> Thanks for the hints, that got reduce to following:
>>
>> #define haszero(v) (((v) - 0x0101010101010101UL) & ~(v) & 0x8080808080808080UL)
>> #define hasvalue(x,n)  (haszero((x) ^ (~0UL/255 * (n))))
>>
>> uint32_t helper_cmpeqb(target_ulong ra, target_ulong rb)
>> {
>>     return !!hasvalue(rb, ra);
>> }

I had it like this:

+    tcg_gen_ext8u_tl(src1, cpu_gpr[rA(ctx->opcode)]);
+    gen_helper_cmpeqb(crf, src1, cpu_gpr[rB(ctx->opcode)]);
+    tcg_gen_shli_i32(crf, crf, CRF_GT);

> A couple of things:
>
> (1) I don't see N being masked to 0xff before replicating.

Was doing that in caller, but as suggested better to do it in the define.

> (2) You need to use ULL for 32-bit hosts, or casts, e.g.

Having it defined only for 64-bit

> #define dup(x)          (((x) & 0xff) * (~(target_ulong)0 / 0xff))
> #define haszero(v)      (((v) - dup(0x01)) & ~(v) & dup(0x80))
> #define hasvalue(x, n)  haszero((x) ^ dup(n))

Yes, had similar stuff but with different names:

+/* if x = 0xab, returns 0xababababababababa */
+#define pattern(x) (~0UL/255 * (x))
+
+/* substract 1 from each byte, AND with inverse, check if MSB is set at each
+ * byte.
+ * i.e. ((0x00 - 0x01) & ~(0x00)) & 0x80
+ *      (0xFF & 0xFF) & 0x80 = 0x80 (zero found)
+ */
+#define haszero(v) (((v) - pattern(0x01)) & ~(v) & pattern(0x80))
+
+/* When you XOR the pattern and there is a match, that byte will be zero */
+#define hasvalue(x,n)  (haszero((x) ^ pattern(n)))


> (3) You probably ought to go ahead and add compute the proper crf value here:
>
>    return hasvalue(rb, ra) ? 1 << CRF_GT : 0;

Sure

> so that within the translator you just have
>
>    gen_helper_cmpeqb(cpu_crf[...], cpu_gpr[...], cpu_gpr[...])

Right, that will be a one-liner, will change.

Regards
Nikunj
Richard Henderson July 23, 2016, 4:01 p.m. UTC | #8
On 07/23/2016 11:38 AM, Nikunj A Dadhania wrote:
>> > (2) You need to use ULL for 32-bit hosts, or casts, e.g.
> Having it defined only for 64-bit
>

32-bit *host*, not 32-bit guest.

I.e. i686 host emulating ppc64 guest.


r~
diff mbox

Patch

diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index a57f7dd..8f7ff49 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -856,6 +856,32 @@  static void gen_cmprb(DisasContext *ctx)
     tcg_temp_free(src2hi);
 }
 
+/* cmpeqb */
+static void gen_cmpeqb(DisasContext *ctx)
+{
+    TCGLabel *l1 = gen_new_label();
+    TCGLabel *l2 = gen_new_label();
+    TCGv src1 = tcg_temp_local_new();
+    TCGv t0 = tcg_temp_local_new();
+    TCGv arg1 = cpu_gpr[rB(ctx->opcode)];
+    int i;
+
+    tcg_gen_andi_tl(src1, cpu_gpr[rA(ctx->opcode)], 0xFF);
+    for (i = 0; i < 64; i += 8) {
+        tcg_gen_shri_tl(t0, arg1, i);
+        tcg_gen_andi_tl(t0, t0, 0xFF);
+        tcg_gen_brcond_tl(TCG_COND_EQ, src1, t0, l1);
+    }
+    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 0);
+    tcg_gen_br(l2);
+    gen_set_label(l1);
+    /* Set match bit, i.e. CRF_GT */
+    tcg_gen_movi_i32(cpu_crf[crfD(ctx->opcode)], 1 << CRF_GT);
+    gen_set_label(l2);
+    tcg_temp_free(src1);
+    tcg_temp_free(t0);
+}
+
 /* isel (PowerPC 2.03 specification) */
 static void gen_isel(DisasContext *ctx)
 {
@@ -10040,6 +10066,7 @@  GEN_HANDLER(cmp, 0x1F, 0x00, 0x00, 0x00400000, PPC_INTEGER),
 GEN_HANDLER(cmpi, 0x0B, 0xFF, 0xFF, 0x00400000, PPC_INTEGER),
 GEN_HANDLER(cmpl, 0x1F, 0x00, 0x01, 0x00400000, PPC_INTEGER),
 GEN_HANDLER(cmpli, 0x0A, 0xFF, 0xFF, 0x00400000, PPC_INTEGER),
+GEN_HANDLER_E(cmpeqb, 0x1F, 0x00, 0x07, 0x00600000, PPC_NONE, PPC2_ISA300),
 GEN_HANDLER_E(cmpb, 0x1F, 0x1C, 0x0F, 0x00000001, PPC_NONE, PPC2_ISA205),
 GEN_HANDLER_E(cmprb, 0x1F, 0x00, 0x06, 0x00400001, PPC_NONE, PPC2_ISA300),
 GEN_HANDLER(isel, 0x1F, 0x0F, 0xFF, 0x00000001, PPC_ISEL),