diff mbox series

[RFC,v1,22/22] target/i386: reimplement (V)P(EQ, CMP)(B, W, D)

Message ID 20190731175702.4916-23-jan.bobek@gmail.com
State New
Headers show
Series reimplement (some) x86 vector instructions using tcg-gvec | expand

Commit Message

Jan Bobek July 31, 2019, 5:57 p.m. UTC
Use the gvec infrastructure to achieve the desired functionality.

Signed-off-by: Jan Bobek <jan.bobek@gmail.com>
---
 target/i386/ops_sse.h        | 13 -------
 target/i386/ops_sse_header.h |  8 -----
 target/i386/translate.c      | 66 ++++++++++++++++++++++++++++++++----
 3 files changed, 60 insertions(+), 27 deletions(-)

Comments

Richard Henderson July 31, 2019, 7:50 p.m. UTC | #1
On 7/31/19 10:57 AM, Jan Bobek wrote:
> +static inline void gen_gvec_cmpeq(unsigned vece, uint32_t dofs,
> +                                  uint32_t aofs, uint32_t bofs,
> +                                  uint32_t oprsz, uint32_t maxsz)
> +{
> +    tcg_gen_gvec_cmp(TCG_COND_EQ, vece, dofs, aofs, bofs, oprsz, maxsz);
> +}
...
> +static inline void gen_gvec_cmpgt(unsigned vece, uint32_t dofs,
> +                                  uint32_t aofs, uint32_t bofs,
> +                                  uint32_t oprsz, uint32_t maxsz)
> +{
> +    tcg_gen_gvec_cmp(TCG_COND_GT, vece, dofs, aofs, bofs, oprsz, maxsz);
> +}

Drop the inlines.

> +#define gen_pcmpgt_mm(env, s, modrm, vece)   gen_gvec_ld_modrm_mm  ((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0112)
> +#define gen_pcmpgt_xmm(env, s, modrm, vece)  gen_gvec_ld_modrm_xmm ((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0112)
> +#define gen_vpcmpgt_xmm(env, s, modrm, vece) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0123)
> +#define gen_vpcmpgt_ymm(env, s, modrm, vece) gen_gvec_ld_modrm_vymm((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0123)
...
> +    case 0x64 | M_0F:                  gen_pcmpgt_mm(env, s, modrm, MO_8); return;
> +    case 0x64 | M_0F | P_66:           gen_pcmpgt_xmm(env, s, modrm, MO_8); return;
> +    case 0x64 | M_0F | P_66 | VEX_128: gen_vpcmpgt_xmm(env, s, modrm, MO_8); return;
> +    case 0x64 | M_0F | P_66 | VEX_256: gen_vpcmpgt_ymm(env, s, modrm, MO_8); return;

Looks like my comments vs PAND apply to all of the subsequent patches as well.
 But everything else looks good.


r~
Aleksandar Markovic July 31, 2019, 8:09 p.m. UTC | #2
On Wed, Jul 31, 2019 at 9:51 PM Richard Henderson <
richard.henderson@linaro.org> wrote:

> On 7/31/19 10:57 AM, Jan Bobek wrote:
> > +static inline void gen_gvec_cmpeq(unsigned vece, uint32_t dofs,
> > +                                  uint32_t aofs, uint32_t bofs,
> > +                                  uint32_t oprsz, uint32_t maxsz)
> > +{
> > +    tcg_gen_gvec_cmp(TCG_COND_EQ, vece, dofs, aofs, bofs, oprsz, maxsz);
> > +}
> ...
> > +static inline void gen_gvec_cmpgt(unsigned vece, uint32_t dofs,
> > +                                  uint32_t aofs, uint32_t bofs,
> > +                                  uint32_t oprsz, uint32_t maxsz)
> > +{
> > +    tcg_gen_gvec_cmp(TCG_COND_GT, vece, dofs, aofs, bofs, oprsz, maxsz);
> > +}
>
> Drop the inlines.
>
>
Why? The compiler will decide at the end of the day, but at least "inline"
here
says that the code author thinks that inlining is desirable, logical, and
expected
in these cases, which is in turn a valuable information for the code reader.

Yours,
Aleksandar



> > +#define gen_pcmpgt_mm(env, s, modrm, vece)   gen_gvec_ld_modrm_mm
> ((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0112)
> > +#define gen_pcmpgt_xmm(env, s, modrm, vece)  gen_gvec_ld_modrm_xmm
> ((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0112)
> > +#define gen_vpcmpgt_xmm(env, s, modrm, vece)
> gen_gvec_ld_modrm_vxmm((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0123)
> > +#define gen_vpcmpgt_ymm(env, s, modrm, vece)
> gen_gvec_ld_modrm_vymm((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0123)
> ...
> > +    case 0x64 | M_0F:                  gen_pcmpgt_mm(env, s, modrm,
> MO_8); return;
> > +    case 0x64 | M_0F | P_66:           gen_pcmpgt_xmm(env, s, modrm,
> MO_8); return;
> > +    case 0x64 | M_0F | P_66 | VEX_128: gen_vpcmpgt_xmm(env, s, modrm,
> MO_8); return;
> > +    case 0x64 | M_0F | P_66 | VEX_256: gen_vpcmpgt_ymm(env, s, modrm,
> MO_8); return;
>
> Looks like my comments vs PAND apply to all of the subsequent patches as
> well.
>  But everything else looks good.
>
>
> r~
>
>
Richard Henderson July 31, 2019, 9:31 p.m. UTC | #3
On 7/31/19 1:09 PM, Aleksandar Markovic wrote:
> 
> 
> On Wed, Jul 31, 2019 at 9:51 PM Richard Henderson <richard.henderson@linaro.org
> <mailto:richard.henderson@linaro.org>> wrote:
> 
>     On 7/31/19 10:57 AM, Jan Bobek wrote:
>     > +static inline void gen_gvec_cmpeq(unsigned vece, uint32_t dofs,
>     > +                                  uint32_t aofs, uint32_t bofs,
>     > +                                  uint32_t oprsz, uint32_t maxsz)
>     > +{
>     > +    tcg_gen_gvec_cmp(TCG_COND_EQ, vece, dofs, aofs, bofs, oprsz, maxsz);
>     > +}
>     ...
>     > +static inline void gen_gvec_cmpgt(unsigned vece, uint32_t dofs,
>     > +                                  uint32_t aofs, uint32_t bofs,
>     > +                                  uint32_t oprsz, uint32_t maxsz)
>     > +{
>     > +    tcg_gen_gvec_cmp(TCG_COND_GT, vece, dofs, aofs, bofs, oprsz, maxsz);
>     > +}
> 
>     Drop the inlines.
> 
> 
> Why? The compiler will decide at the end of the day, but at least "inline" here
> says that the code author thinks that inlining is desirable, logical, and expected
> in these cases, which is in turn a valuable information for the code reader.

In this case it is in fact a lie that will only confuse the reader, as it did
you.  Functions whose address are passed as a callback, as these are, are
always forced out of line.

But beyond that, clang diagnoses unused static inline within *.c while gcc does
not (I'm not sure I agree with clang, but it is what it is).  By leaving off
the inline, but compilers will diagnose when code rearrangement leaves a
function unused.


r~
Jan Bobek Aug. 2, 2019, 2:07 p.m. UTC | #4
On 7/31/19 5:31 PM, Richard Henderson wrote:
> On 7/31/19 1:09 PM, Aleksandar Markovic wrote:
>>
>>
>> On Wed, Jul 31, 2019 at 9:51 PM Richard Henderson <richard.henderson@linaro.org
>> <mailto:richard.henderson@linaro.org>> wrote:
>>
>>     On 7/31/19 10:57 AM, Jan Bobek wrote:
>>     > +static inline void gen_gvec_cmpeq(unsigned vece, uint32_t dofs,
>>     > +                                  uint32_t aofs, uint32_t bofs,
>>     > +                                  uint32_t oprsz, uint32_t maxsz)
>>     > +{
>>     > +    tcg_gen_gvec_cmp(TCG_COND_EQ, vece, dofs, aofs, bofs, oprsz, maxsz);
>>     > +}
>>     ...
>>     > +static inline void gen_gvec_cmpgt(unsigned vece, uint32_t dofs,
>>     > +                                  uint32_t aofs, uint32_t bofs,
>>     > +                                  uint32_t oprsz, uint32_t maxsz)
>>     > +{
>>     > +    tcg_gen_gvec_cmp(TCG_COND_GT, vece, dofs, aofs, bofs, oprsz, maxsz);
>>     > +}
>>
>>     Drop the inlines.
>>
>>
>> Why? The compiler will decide at the end of the day, but at least "inline" here
>> says that the code author thinks that inlining is desirable, logical, and expected
>> in these cases, which is in turn a valuable information for the code reader.
> 
> In this case it is in fact a lie that will only confuse the reader, as it did
> you.  Functions whose address are passed as a callback, as these are, are
> always forced out of line.
> 
> But beyond that, clang diagnoses unused static inline within *.c while gcc does
> not (I'm not sure I agree with clang, but it is what it is).  By leaving off
> the inline, but compilers will diagnose when code rearrangement leaves a
> function unused.

Dang, I completely forgot about the function-address vs. inlining rule. I thought
of these as macros, really; they are only functions because I needed to pass
them to the gen_gvec_ld_modrm_* helpers.

I'll drop the inline, compilers ignore it anyway.

-Jan
Aleksandar Markovic Aug. 2, 2019, 2:18 p.m. UTC | #5
>
>
>   Functions whose address are passed as a callback, as these are, are
> always forced out of line.
>
>
OK, Richard. However, on a much higher level than this single patch, I am
really
curious about this: what would be the rationale beyond the use of callbacks
in TCG
vector support interface? What is, in fact, achieved with such interface
design that
could not be achieved with callback-less approach?

Thanks,
Aleksandar
diff mbox series

Patch

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 75ff686bb6..b6ace9410f 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -337,11 +337,6 @@  static inline int satsw(int x)
     }
 }
 
-#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
-#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
-#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
-#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)
-
 #define FMULLW(a, b) ((a) * (b))
 #define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
 #define FMULHUW(a, b) ((a) * (b) >> 16)
@@ -350,14 +345,6 @@  static inline int satsw(int x)
 #define FAVG(a, b) (((a) + (b) + 1) >> 1)
 #endif
 
-SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
-SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
-SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
-
-SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
-SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
-SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
-
 SSE_HELPER_W(helper_pmullw, FMULLW)
 #if SHIFT == 0
 SSE_HELPER_W(helper_pmulhrw, FMULHRW)
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 9c7451d28e..d8e33dff6b 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -60,14 +60,6 @@  DEF_HELPER_3(glue(pslldq, SUFFIX), void, env, Reg, Reg)
 #define SSE_HELPER_Q(name, F)\
     DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
 
-SSE_HELPER_B(pcmpgtb, FCMPGTB)
-SSE_HELPER_W(pcmpgtw, FCMPGTW)
-SSE_HELPER_L(pcmpgtl, FCMPGTL)
-
-SSE_HELPER_B(pcmpeqb, FCMPEQ)
-SSE_HELPER_W(pcmpeqw, FCMPEQ)
-SSE_HELPER_L(pcmpeql, FCMPEQ)
-
 SSE_HELPER_W(pmullw, FMULLW)
 #if SHIFT == 0
 SSE_HELPER_W(pmulhrw, FMULHRW)
diff --git a/target/i386/translate.c b/target/i386/translate.c
index d08d2cedce..729509e1ff 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -2783,9 +2783,9 @@  static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0x61] = MMX_OP2(punpcklwd),
     [0x62] = MMX_OP2(punpckldq),
     [0x63] = MMX_OP2(packsswb),
-    [0x64] = MMX_OP2(pcmpgtb),
-    [0x65] = MMX_OP2(pcmpgtw),
-    [0x66] = MMX_OP2(pcmpgtl),
+    [0x64] = { SSE_TOMBSTONE, SSE_TOMBSTONE },
+    [0x65] = { SSE_TOMBSTONE, SSE_TOMBSTONE },
+    [0x66] = { SSE_TOMBSTONE, SSE_TOMBSTONE },
     [0x67] = MMX_OP2(packuswb),
     [0x68] = MMX_OP2(punpckhbw),
     [0x69] = MMX_OP2(punpckhwd),
@@ -2802,9 +2802,9 @@  static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0x71] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */
     [0x72] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */
     [0x73] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */
-    [0x74] = MMX_OP2(pcmpeqb),
-    [0x75] = MMX_OP2(pcmpeqw),
-    [0x76] = MMX_OP2(pcmpeql),
+    [0x74] = { SSE_TOMBSTONE, SSE_TOMBSTONE },
+    [0x75] = { SSE_TOMBSTONE, SSE_TOMBSTONE },
+    [0x76] = { SSE_TOMBSTONE, SSE_TOMBSTONE },
     [0x77] = { SSE_DUMMY }, /* emms */
     [0x78] = { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, insertq_i */
     [0x79] = { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r },
@@ -3216,6 +3216,30 @@  static inline void gen_gvec_ld_modrm_3(CPUX86State *env, DisasContext *s,
 #define gen_vpmaxu_xmm(env, s, modrm, vece) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), (vece), tcg_gen_gvec_umax, 0123)
 #define gen_vpmaxu_ymm(env, s, modrm, vece) gen_gvec_ld_modrm_vymm((env), (s), (modrm), (vece), tcg_gen_gvec_umax, 0123)
 
+static inline void gen_gvec_cmpeq(unsigned vece, uint32_t dofs,
+                                  uint32_t aofs, uint32_t bofs,
+                                  uint32_t oprsz, uint32_t maxsz)
+{
+    tcg_gen_gvec_cmp(TCG_COND_EQ, vece, dofs, aofs, bofs, oprsz, maxsz);
+}
+
+#define gen_pcmpeq_mm(env, s, modrm, vece)   gen_gvec_ld_modrm_mm  ((env), (s), (modrm), (vece), gen_gvec_cmpeq, 0112)
+#define gen_pcmpeq_xmm(env, s, modrm, vece)  gen_gvec_ld_modrm_xmm ((env), (s), (modrm), (vece), gen_gvec_cmpeq, 0112)
+#define gen_vpcmpeq_xmm(env, s, modrm, vece) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), (vece), gen_gvec_cmpeq, 0123)
+#define gen_vpcmpeq_ymm(env, s, modrm, vece) gen_gvec_ld_modrm_vymm((env), (s), (modrm), (vece), gen_gvec_cmpeq, 0123)
+
+static inline void gen_gvec_cmpgt(unsigned vece, uint32_t dofs,
+                                  uint32_t aofs, uint32_t bofs,
+                                  uint32_t oprsz, uint32_t maxsz)
+{
+    tcg_gen_gvec_cmp(TCG_COND_GT, vece, dofs, aofs, bofs, oprsz, maxsz);
+}
+
+#define gen_pcmpgt_mm(env, s, modrm, vece)   gen_gvec_ld_modrm_mm  ((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0112)
+#define gen_pcmpgt_xmm(env, s, modrm, vece)  gen_gvec_ld_modrm_xmm ((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0112)
+#define gen_vpcmpgt_xmm(env, s, modrm, vece) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0123)
+#define gen_vpcmpgt_ymm(env, s, modrm, vece) gen_gvec_ld_modrm_vymm((env), (s), (modrm), (vece), gen_gvec_cmpgt, 0123)
+
 #define gen_pand_mm(env, s, modrm)   gen_gvec_ld_modrm_mm  ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
 #define gen_pand_xmm(env, s, modrm)  gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
 #define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
@@ -3451,6 +3475,36 @@  static void gen_sse(CPUX86State *env, DisasContext *s, int b)
     case 0xee | M_0F | P_66 | VEX_128: gen_vpmaxs_xmm(env, s, modrm, MO_16); return;
     case 0xee | M_0F | P_66 | VEX_256: gen_vpmaxs_ymm(env, s, modrm, MO_16); return;
 
+    case 0x64 | M_0F:                  gen_pcmpgt_mm(env, s, modrm, MO_8); return;
+    case 0x64 | M_0F | P_66:           gen_pcmpgt_xmm(env, s, modrm, MO_8); return;
+    case 0x64 | M_0F | P_66 | VEX_128: gen_vpcmpgt_xmm(env, s, modrm, MO_8); return;
+    case 0x64 | M_0F | P_66 | VEX_256: gen_vpcmpgt_ymm(env, s, modrm, MO_8); return;
+
+    case 0x65 | M_0F:                  gen_pcmpgt_mm(env, s, modrm, MO_16); return;
+    case 0x65 | M_0F | P_66:           gen_pcmpgt_xmm(env, s, modrm, MO_16); return;
+    case 0x65 | M_0F | P_66 | VEX_128: gen_vpcmpgt_xmm(env, s, modrm, MO_16); return;
+    case 0x65 | M_0F | P_66 | VEX_256: gen_vpcmpgt_ymm(env, s, modrm, MO_16); return;
+
+    case 0x66 | M_0F:                  gen_pcmpgt_mm(env, s, modrm, MO_32); return;
+    case 0x66 | M_0F | P_66:           gen_pcmpgt_xmm(env, s, modrm, MO_32); return;
+    case 0x66 | M_0F | P_66 | VEX_128: gen_vpcmpgt_xmm(env, s, modrm, MO_32); return;
+    case 0x66 | M_0F | P_66 | VEX_256: gen_vpcmpgt_ymm(env, s, modrm, MO_32); return;
+
+    case 0x74 | M_0F:                  gen_pcmpeq_mm(env, s, modrm, MO_8); return;
+    case 0x74 | M_0F | P_66:           gen_pcmpeq_xmm(env, s, modrm, MO_8); return;
+    case 0x74 | M_0F | P_66 | VEX_128: gen_vpcmpeq_xmm(env, s, modrm, MO_8); return;
+    case 0x74 | M_0F | P_66 | VEX_256: gen_vpcmpeq_ymm(env, s, modrm, MO_8); return;
+
+    case 0x75 | M_0F:                  gen_pcmpeq_mm(env, s, modrm, MO_16); return;
+    case 0x75 | M_0F | P_66:           gen_pcmpeq_xmm(env, s, modrm, MO_16); return;
+    case 0x75 | M_0F | P_66 | VEX_128: gen_vpcmpeq_xmm(env, s, modrm, MO_16); return;
+    case 0x75 | M_0F | P_66 | VEX_256: gen_vpcmpeq_ymm(env, s, modrm, MO_16); return;
+
+    case 0x76 | M_0F:                  gen_pcmpeq_mm(env, s, modrm, MO_32); return;
+    case 0x76 | M_0F | P_66:           gen_pcmpeq_xmm(env, s, modrm, MO_32); return;
+    case 0x76 | M_0F | P_66 | VEX_128: gen_vpcmpeq_xmm(env, s, modrm, MO_32); return;
+    case 0x76 | M_0F | P_66 | VEX_256: gen_vpcmpeq_ymm(env, s, modrm, MO_32); return;
+
     case 0xdb | M_0F:                  gen_pand_mm(env, s, modrm); return;
     case 0xdb | M_0F | P_66:           gen_pand_xmm(env, s, modrm); return;
     case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;