diff mbox series

[25/37] target/i386: reimplement 0x0f 0xd0-0xd7, 0xe0-0xe7, 0xf0-0xf7, add AVX

Message ID 20220911230418.340941-26-pbonzini@redhat.com
State New
Headers show
Series target/i386: new decoder + AVX implementation | expand

Commit Message

Paolo Bonzini Sept. 11, 2022, 11:04 p.m. UTC
The more complicated ones here are d6-d7, e6-e7, f7.  The others
are trivial.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.c.inc | 39 +++++++++++++
 target/i386/tcg/emit.c.inc       | 99 +++++++++++++++++++++++++++++---
 target/i386/tcg/translate.c      |  4 +-
 3 files changed, 133 insertions(+), 9 deletions(-)

Comments

Richard Henderson Sept. 12, 2022, 3:06 p.m. UTC | #1
On 9/12/22 00:04, Paolo Bonzini wrote:
> +    [0xd7] = X86_OP_ENTRY3(PMOVMSKB,  G,d, None,None, U,x,  vex7 mmx avx2_256 p_00_66), /* MOVNTQ/MOVNTDQ */

Cut and paste comment?

> +BINARY_INT_MMX(PMULLW,  pmullw)

tcg_gen_gvec_mul

> +static void gen_VCVTpd_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
> +{
> +    SSEFunc_0_epp fn = NULL;
> +    switch (sse_prefix(s)) {
> +    case 0x66:
> +        fn = s->vex_l ? gen_helper_cvttpd2dq_ymm : gen_helper_cvttpd2dq_xmm;
> +        break;
> +    case 0xf3:
> +        fn = s->vex_l ? gen_helper_cvtdq2pd_ymm : gen_helper_cvtdq2pd_xmm;
> +        break;
> +    case 0xf2:
> +        fn = s->vex_l ? gen_helper_cvtpd2dq_ymm : gen_helper_cvtpd2dq_xmm;
> +        break;
> +    }
> +    fn(cpu_env, s->ptr0, s->ptr2);
> +}

Earlier decode?


r~
diff mbox series

Patch

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 0e2da85934..e9a9981a7f 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -242,6 +242,18 @@  static void decode_0F7E(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
     }
 }
 
+static void decode_0FD6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry movq[4] = {
+        {},
+        X86_OP_ENTRY3(MOVQ,    W,x,  None, None, V,q, vex5),
+        X86_OP_ENTRY3(MOVq_dq, V,dq, None, None, N,q),
+        X86_OP_ENTRY3(MOVq_dq, P,q,  None, None, U,q),
+    };
+
+    *entry = *decode_by_prefix(s, movq);
+}
+
 static const X86OpEntry opcodes_0F38_00toEF[240] = {
 };
 
@@ -396,6 +408,33 @@  static const X86OpEntry opcodes_0F[256] = {
     [0x7e] = X86_OP_GROUP0(0F7E),
     [0x7f] = X86_OP_GROUP3(0F6F,       W,x, None,None, V,x, vex5 mmx p_00_66_f3),
 
+    [0xd0] = X86_OP_ENTRY3(VADDSUB,   V,x, H,x, W,x,        vex2 cpuid(SSE3) p_66_f2),
+    [0xd1] = X86_OP_ENTRY3(PSRLW_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd2] = X86_OP_ENTRY3(PSRLD_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd3] = X86_OP_ENTRY3(PSRLQ_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd4] = X86_OP_ENTRY3(PADDQ,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd5] = X86_OP_ENTRY3(PMULLW,    V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd6] = X86_OP_GROUP0(0FD6),
+    [0xd7] = X86_OP_ENTRY3(PMOVMSKB,  G,d, None,None, U,x,  vex7 mmx avx2_256 p_00_66), /* MOVNTQ/MOVNTDQ */
+
+    [0xe0] = X86_OP_ENTRY3(PAVGB,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xe1] = X86_OP_ENTRY3(PSRAW_r,   V,x, H,x, W,x,        vex7 mmx avx2_256 p_00_66),
+    [0xe2] = X86_OP_ENTRY3(PSRAD_r,   V,x, H,x, W,x,        vex7 mmx avx2_256 p_00_66),
+    [0xe3] = X86_OP_ENTRY3(PAVGW,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xe4] = X86_OP_ENTRY3(PMULHUW,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xe5] = X86_OP_ENTRY3(PMULHW,    V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xe6] = X86_OP_ENTRY2(VCVTpd_dq, V,x, W,x,             vex2 p_66_f3_f2),
+    [0xe7] = X86_OP_ENTRY3(MOVDQ,     W,x, None,None, V,x,  vex1 mmx p_00_66), /* MOVNTQ/MOVNTDQ */
+
+    [0xf0] = X86_OP_ENTRY3(LDDQU,    V,x, None,None, M,x,   vex4_unal cpuid(SSE3) p_f2),
+    [0xf1] = X86_OP_ENTRY3(PSLLW_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
+    [0xf2] = X86_OP_ENTRY3(PSLLD_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
+    [0xf3] = X86_OP_ENTRY3(PSLLQ_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
+    [0xf4] = X86_OP_ENTRY3(PMULUDQ,  V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
+    [0xf5] = X86_OP_ENTRY3(PMADDWD,  V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
+    [0xf6] = X86_OP_ENTRY3(PSADBW,   V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
+    [0xf7] = X86_OP_ENTRY3(MASKMOV,  None,None, V,dq, U,dq, vex4_unal avx2_256 mmx p_00_66),
+
     /* Incorrectly missing from 2-17 */
     [0xd8] = X86_OP_ENTRY3(PSUBUSB,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
     [0xd9] = X86_OP_ENTRY3(PSUBUSW,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index fb01035d06..c90f933093 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -403,6 +403,7 @@  static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
 }
 HORIZONTAL_FP_SSE(VHADD, hadd)
 HORIZONTAL_FP_SSE(VHSUB, hsub)
+HORIZONTAL_FP_SSE(VADDSUB, addsub)
 
 /*
  * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
@@ -462,6 +463,24 @@  BINARY_INT_MMX(PADDSB, paddsb)
 BINARY_INT_MMX(PADDSW, paddsw)
 BINARY_INT_MMX(PMAXSW, pmaxsw)
 
+BINARY_INT_MMX(PAVGB,   pavgb)
+BINARY_INT_MMX(PAVGW,   pavgw)
+BINARY_INT_MMX(PMADDWD, pmaddwd)
+BINARY_INT_MMX(PMULHUW, pmulhuw)
+BINARY_INT_MMX(PMULHW,  pmulhw)
+BINARY_INT_MMX(PMULLW,  pmullw)
+BINARY_INT_MMX(PMULUDQ, pmuludq)
+BINARY_INT_MMX(PSADBW,  psadbw)
+
+BINARY_INT_MMX(PSLLW_r, psllw)
+BINARY_INT_MMX(PSLLD_r, pslld)
+BINARY_INT_MMX(PSLLQ_r, psllq)
+BINARY_INT_MMX(PSRLW_r, psrlw)
+BINARY_INT_MMX(PSRLD_r, psrld)
+BINARY_INT_MMX(PSRLQ_r, psrlq)
+BINARY_INT_MMX(PSRAW_r, psraw)
+BINARY_INT_MMX(PSRAD_r, psrad)
+
 /* Instructions with no MMX equivalent.  */
 #define BINARY_INT_SSE(uname, lname)                                               \
 static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
@@ -680,6 +699,24 @@  static void gen_EMMS_VZERO(DisasContext *s, CPUX86State *env, X86DecodedInsn *de
     }
 }
 
+static void gen_LDDQU(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_load_sse(s, s->T0, decode->op[0].ot, decode->op[0].offset);
+}
+
+static void gen_MASKMOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]);
+    gen_extu(s->aflag, s->A0);
+    gen_add_A0_ds_seg(s);
+
+    if (s->prefix & PREFIX_DATA) {
+        gen_helper_maskmov_xmm(cpu_env, s->ptr1, s->ptr2, s->A0);
+    } else {
+        gen_helper_maskmov_mmx(cpu_env, s->ptr1, s->ptr2, s->A0);
+    }
+}
+
 static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -756,14 +793,26 @@  static void gen_MOVMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode
 
 static void gen_MOVQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
-    int vec_len = sse_vec_len(s, decode);
-    int lo_ofs = decode->op[0].offset
-        - xmm_offset(decode->op[0].ot)
-        + xmm_offset(MO_64);
-
     tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset);
-    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, lo_ofs);
+
+    if (decode->op[0].has_ea) {
+        gen_op_st_v(s, MO_64, s->tmp1_i64, s->A0);
+    } else {
+        int vec_len = sse_vec_len(s, decode);
+        int lo_ofs = decode->op[0].offset
+            - xmm_offset(decode->op[0].ot)
+            + xmm_offset(MO_64);
+
+        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+        tcg_gen_st_i64(s->tmp1_i64, cpu_env, lo_ofs);
+    }
+}
+
+static void gen_MOVq_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_helper_enter_mmx(cpu_env);
+    /* Otherwise the same as any other movq.  */
+    return gen_MOVQ(s, env, decode);
 }
 
 static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
@@ -816,6 +865,15 @@  static void gen_PADDD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
                      decode->op[2].offset, vec_len, vec_len);
 }
 
+static void gen_PADDQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    tcg_gen_gvec_add(MO_64,
+                     decode->op[0].offset, decode->op[1].offset,
+                     decode->op[2].offset, vec_len, vec_len);
+}
+
 static void gen_PAND(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     int vec_len = sse_vec_len(s, decode);
@@ -906,6 +964,16 @@  static void gen_PEXT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     gen_helper_pext(s->T0, s->T0, s->T1);
 }
 
+static void gen_PMOVMSKB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if (s->prefix & PREFIX_DATA) {
+        gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr2);
+    } else {
+        gen_helper_pmovmskb_mmx(s->tmp2_i32, cpu_env, s->ptr2);
+    }
+    tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
+}
+
 static void gen_POR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     int vec_len = sse_vec_len(s, decode);
@@ -1202,6 +1270,23 @@  static void gen_VCVTfp2fp(DisasContext *s, CPUX86State *env, X86DecodedInsn *dec
                      gen_helper_cvtsd2ss, gen_helper_cvtss2sd);
 }
 
+static void gen_VCVTpd_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    SSEFunc_0_epp fn = NULL;
+    switch (sse_prefix(s)) {
+    case 0x66:
+        fn = s->vex_l ? gen_helper_cvttpd2dq_ymm : gen_helper_cvttpd2dq_xmm;
+        break;
+    case 0xf3:
+        fn = s->vex_l ? gen_helper_cvtdq2pd_ymm : gen_helper_cvtdq2pd_xmm;
+        break;
+    case 0xf2:
+        fn = s->vex_l ? gen_helper_cvtpd2dq_ymm : gen_helper_cvtpd2dq_xmm;
+        break;
+    }
+    fn(cpu_env, s->ptr0, s->ptr2);
+}
+
 static void gen_VCVTps_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     SSEFunc_0_epp fn = NULL;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 45287dfea2..d15e988891 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -4668,8 +4668,8 @@  static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
 #endif
         if (use_new &&
             ((b >= 0x150 && b <= 0x17f) ||
-             (b >= 0x1d8 && b <= 0x1ff && (b & 8)))) {
-            return disas_insn_new(s, cpu, b + 0x100);
+             (b >= 0x1d0 && b <= 0x1ff))) {
+            return disas_insn_new(s, cpu, b);
         }
         break;
     case 0xf3: