[24/37] target/i386: reimplement 0x0f 0x70-0x77, add AVX

Message ID 20220911230418.340941-25-pbonzini@redhat.com
State New
Series target/i386: new decoder + AVX implementation

Commit Message

Paolo Bonzini Sept. 11, 2022, 11:04 p.m. UTC
This includes shifts by immediate, which use bits 3-5 of the ModRM byte
as an opcode extension.  With the exception of 128-bit shifts, they are
implemented using gvec.

This also covers VZEROALL and VZEROUPPER, which use the same opcode
as EMMS.  If we ever want to optimize out gen_clear_ymmh, this would be
one of the starting points.  The implementation of the VZEROALL
and VZEROUPPER helpers is by Paul Brook.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/helper.h             |   7 +
 target/i386/tcg/decode-new.c.inc |  76 ++++++++++
 target/i386/tcg/emit.c.inc       | 232 +++++++++++++++++++++++++++++++
 target/i386/tcg/fpu_helper.c     |  46 ++++++
 target/i386/tcg/translate.c      |   3 +-
 5 files changed, 362 insertions(+), 2 deletions(-)
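
For reference, the opcode extension in question is simply the reg field
(bits 3-5) of the ModRM byte; a minimal sketch of the selection, using a
hypothetical helper name but mirroring the decoder's own
(get_modrm(s, env) >> 3) & 7:

    /* Groups 12-14: bits 3-5 of ModRM pick the /0../7 operation. */
    static inline int modrm_opcode_extension(uint8_t modrm)
    {
        return (modrm >> 3) & 7;
    }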

Comments

Richard Henderson Sept. 12, 2022, 2:29 p.m. UTC | #1
On 9/12/22 00:04, Paolo Bonzini wrote:
> This includes shifts by immediate, which use bits 3-5 of the ModRM byte
> as an opcode extension.  With the exception of 128-bit shifts, they are
> implemented using gvec.
> 
> This also covers VZEROALL and VZEROUPPER, which use the same opcode
> as EMMS.  If we ever want to optimize out gen_clear_ymmh, this would be
> one of the starting points.  The implementation of the VZEROALL
> and VZEROUPPER helpers is by Paul Brook.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   target/i386/helper.h             |   7 +
>   target/i386/tcg/decode-new.c.inc |  76 ++++++++++
>   target/i386/tcg/emit.c.inc       | 232 +++++++++++++++++++++++++++++++
>   target/i386/tcg/fpu_helper.c     |  46 ++++++
>   target/i386/tcg/translate.c      |   3 +-
>   5 files changed, 362 insertions(+), 2 deletions(-)
> 
> diff --git a/target/i386/helper.h b/target/i386/helper.h
> index 3da5df98b9..d7e6878263 100644
> --- a/target/i386/helper.h
> +++ b/target/i386/helper.h
> @@ -221,6 +221,13 @@ DEF_HELPER_3(movq, void, env, ptr, ptr)
>   #define SHIFT 2
>   #include "ops_sse_header.h"
>   
> +DEF_HELPER_1(vzeroall, void, env)
> +DEF_HELPER_1(vzeroupper, void, env)
> +#ifdef TARGET_X86_64
> +DEF_HELPER_1(vzeroall_hi8, void, env)
> +DEF_HELPER_1(vzeroupper_hi8, void, env)
> +#endif
> +
>   DEF_HELPER_3(rclb, tl, env, tl, tl)
>   DEF_HELPER_3(rclw, tl, env, tl, tl)
>   DEF_HELPER_3(rcll, tl, env, tl, tl)
> diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
> index 6aa8bac74f..0e2da85934 100644
> --- a/target/i386/tcg/decode-new.c.inc
> +++ b/target/i386/tcg/decode-new.c.inc
> @@ -133,6 +133,19 @@ static uint8_t get_modrm(DisasContext *s, CPUX86State *env)
>       return s->modrm;
>   }
>   
> +static inline const X86OpEntry *decode_by_prefix(DisasContext *s, const X86OpEntry entries[4])
> +{
> +    if (s->prefix & PREFIX_REPNZ) {
> +        return &entries[3];
> +    } else if (s->prefix & PREFIX_REPZ) {
> +        return &entries[2];
> +    } else if (s->prefix & PREFIX_DATA) {
> +        return &entries[1];
> +    } else {
> +        return &entries[0];
> +    }
> +}

This is the sort of thing I would have expected for some of the other insns for which the
distinction was delayed until generation, like SSE4a_{R,I}.

> +static void decode_group12_13_14(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
> +{
> +    static const X86OpEntry group[3][8] = {
> +        {
> +            /* grp12 */
> +            {},
> +            {},
> +            X86_OP_ENTRY3(PSRLW_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
> +            {},
> +            X86_OP_ENTRY3(PSRAW_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
> +            {},
> +            X86_OP_ENTRY3(PSLLW_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
> +            {},
> +        },

Why combine these 3 groups?

> +    *entry = group[*b - 0x71][op];

Split them and you avoid this magic number.
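
For instance (only group 14 shown, groups 12 and 13 are analogous; the
entries are just the ones from this patch):

     static void decode_group14(DisasContext *s, CPUX86State *env,
                                X86OpEntry *entry, uint8_t *b)
     {
         static const X86OpEntry group14[8] = {
             {},
             {},
             X86_OP_ENTRY3(PSRLQ_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
             X86_OP_ENTRY3(PSRLDQ_i, H,x, U,x, I,b, vex7 avx2_256 p_66),
             {},
             {},
             X86_OP_ENTRY3(PSLLQ_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
             X86_OP_ENTRY3(PSLLDQ_i, H,x, U,x, I,b, vex7 avx2_256 p_66),
         };

         *entry = group14[(get_modrm(s, env) >> 3) & 7];
     }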

> +static inline void gen_unary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
> +                                     SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm)
> +{
> +    TCGv_i32 imm = tcg_const_i32(decode->immediate);

Use tcg_constant_i32, which need not be freed.
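
i.e. the same function, minus the free:

     TCGv_i32 imm = tcg_constant_i32(decode->immediate);
     if (!s->vex_l) {
         xmm(s->ptr0, s->ptr1, imm);
     } else {
         ymm(s->ptr0, s->ptr1, imm);
     }
     /* no tcg_temp_free_i32: constants are not temporaries */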

> +static void gen_EMMS_VZERO(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
> +{
> +    if (!(s->prefix & PREFIX_VEX)) {
> +        gen_helper_emms(cpu_env);
> +        return;
> +    }

Split in decode?  That would make vex8 simpler too.
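
Something along these lines, say -- a sketch, assuming the vex8 flag is
backed by a vex_class field in X86OpEntry and using hypothetical gen_EMMS /
gen_VZEROALL / gen_VZEROUPPER generators:

     static void decode_0F77(DisasContext *s, CPUX86State *env,
                             X86OpEntry *entry, uint8_t *b)
     {
         if (!(s->prefix & PREFIX_VEX)) {
             entry->gen = gen_EMMS;
         } else if (s->vex_l) {
             entry->gen = gen_VZEROALL;
             entry->vex_class = 8;
         } else {
             entry->gen = gen_VZEROUPPER;
             entry->vex_class = 8;
         }
     }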

> +static inline TCGv_ptr make_imm_mmx_vec(uint32_t imm)

Unused?  Please do drop all of the inline markers, and/or do build testing with clang, 
which will Werror on this.

> +static inline TCGv_ptr make_imm_xmm_vec(uint32_t imm, int vec_len)
> +{
> +    MemOp ot = vec_len == 16 ? MO_128 : MO_256;
> +    TCGv_i32 imm_v = tcg_const_i32(imm);

tcg_constant_i32, however I think this use can go away too.

> +static void gen_PSRLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
> +{
> +    int vec_len = sse_vec_len(s, decode);
> +    TCGv_ptr imm_vec = make_imm_xmm_vec(decode->immediate, vec_len);
> +
> +    if (s->vex_l) {
> +        gen_helper_psrldq_ymm(cpu_env, s->ptr0, s->ptr1, imm_vec);
> +    } else {
> +        gen_helper_psrldq_xmm(cpu_env, s->ptr0, s->ptr1, imm_vec);
> +    }
> +    tcg_temp_free_ptr(imm_vec);

Let's just do this inline:

     int shift = decode->immediate * 8;

     if (shift >= 128) {
         zero;
         return;
     }

     for (lane = 0; lane <= s->vex_l; lane++) {
         TCGv_i64 q0 = tcg_temp_new_i64();
         TCGv_i64 q1 = tcg_temp_new_i64();

        tcg_gen_ld_i64(q0, cpu_env, offset + lane * 16 + offsetof(XMMReg, MMX_Q(0)));
         tcg_gen_ld_i64(q1, ...);

         if (shift >= 64) {
             tcg_gen_shri_i64(q0, q1, shift - 64);
             tcg_gen_movi_i64(q1, 0);
         } else {
             tcg_gen_extract2_i64(q0, q0, q1, shift);
             tcg_gen_shri_i64(q1, q1, shift);
         }

        tcg_gen_st_i64(q0, cpu_env, offset + lane * 16 + offsetof(XMMReg, MMX_Q(0)));
         tcg_gen_st_i64(q1, ...);
     }


> +static void gen_PSLLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
> +{
> +    int vec_len = sse_vec_len(s, decode);
> +    TCGv_ptr imm_vec = make_imm_xmm_vec(decode->immediate, vec_len);
> +
> +    if (s->vex_l) {
> +        gen_helper_pslldq_ymm(cpu_env, s->ptr0, s->ptr1, imm_vec);
> +    } else {
> +        gen_helper_pslldq_xmm(cpu_env, s->ptr0, s->ptr1, imm_vec);
> +    }
> +    tcg_temp_free_ptr(imm_vec);
> +}

Similar, but the extract2 becomes

     tcg_gen_extract2_i64(q1, q0, q1, 64 - shift);

> +void helper_vzeroall(CPUX86State *env)
> +{
> +    int i;
> +
> +    for (i = 0; i < 8; i++) {
> +        env->xmm_regs[i].ZMM_Q(0) = 0;
> +        env->xmm_regs[i].ZMM_Q(1) = 0;
> +        env->xmm_regs[i].ZMM_Q(2) = 0;
> +        env->xmm_regs[i].ZMM_Q(3) = 0;
> +    }
> +}

Better with memset, I think, available as gen_helper_memset().
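
i.e. emit it from the generator instead of looping in a helper; a sketch
(hypothetical gen_VZEROALL, and the size deliberately covers all
CPU_NB_REGS registers and the full ZMMReg, which is fine since the extra
state is not guest-visible here anyway):

     static void gen_VZEROALL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     {
         TCGv_ptr ptr = tcg_temp_new_ptr();

         tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, xmm_regs));
         gen_helper_memset(ptr, ptr, tcg_constant_i32(0),
                           tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg)));
         tcg_temp_free_ptr(ptr);
     }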

> +#ifdef TARGET_X86_64
> +void helper_vzeroall_hi8(CPUX86State *env)
> +{
> +    int i;
> +
> +    for (i = 8; i < 16; i++) {
> +        env->xmm_regs[i].ZMM_Q(0) = 0;
> +        env->xmm_regs[i].ZMM_Q(1) = 0;
> +        env->xmm_regs[i].ZMM_Q(2) = 0;
> +        env->xmm_regs[i].ZMM_Q(3) = 0;
> +    }
> +}

Likewise.


> +
> +void helper_vzeroupper_hi8(CPUX86State *ense_new &&
> -            ((b >= 0x150 && b <= 0x16f) ||
> -             (b >= 0x178 && b <= 0x17f) ||
> +            ((b >= 0x150 && b <= 0x17f) ||
>                (b >= 0x1d8 && b <= 0x1ff && (b & 8)))) {
>               return disas_insn_new(s, cpu, b + 0x100);
>           }

More mailer lossage?


r~

Patch

diff --git a/target/i386/helper.h b/target/i386/helper.h
index 3da5df98b9..d7e6878263 100644
--- a/target/i386/helper.h
+++ b/target/i386/helper.h
@@ -221,6 +221,13 @@  DEF_HELPER_3(movq, void, env, ptr, ptr)
 #define SHIFT 2
 #include "ops_sse_header.h"
 
+DEF_HELPER_1(vzeroall, void, env)
+DEF_HELPER_1(vzeroupper, void, env)
+#ifdef TARGET_X86_64
+DEF_HELPER_1(vzeroall_hi8, void, env)
+DEF_HELPER_1(vzeroupper_hi8, void, env)
+#endif
+
 DEF_HELPER_3(rclb, tl, env, tl, tl)
 DEF_HELPER_3(rclw, tl, env, tl, tl)
 DEF_HELPER_3(rcll, tl, env, tl, tl)
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 6aa8bac74f..0e2da85934 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -133,6 +133,19 @@  static uint8_t get_modrm(DisasContext *s, CPUX86State *env)
     return s->modrm;
 }
 
+static inline const X86OpEntry *decode_by_prefix(DisasContext *s, const X86OpEntry entries[4])
+{
+    if (s->prefix & PREFIX_REPNZ) {
+        return &entries[3];
+    } else if (s->prefix & PREFIX_REPZ) {
+        return &entries[2];
+    } else if (s->prefix & PREFIX_DATA) {
+        return &entries[1];
+    } else {
+        return &entries[0];
+    }
+}
+
 static void decode_group17(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86GenFunc group17_gen[8] = {
@@ -142,6 +155,48 @@  static void decode_group17(DisasContext *s, CPUX86State *env, X86OpEntry *entry,
     entry->gen = group17_gen[op];
 }
 
+static void decode_group12_13_14(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry group[3][8] = {
+        {
+            /* grp12 */
+            {},
+            {},
+            X86_OP_ENTRY3(PSRLW_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+            {},
+            X86_OP_ENTRY3(PSRAW_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+            {},
+            X86_OP_ENTRY3(PSLLW_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+            {},
+        },
+        {
+            /* grp13 */
+            {},
+            {},
+            X86_OP_ENTRY3(PSRLD_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+            {},
+            X86_OP_ENTRY3(PSRAD_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+            {},
+            X86_OP_ENTRY3(PSLLD_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+            {},
+        },
+        {
+            /* grp14 */
+            {},
+            {},
+            X86_OP_ENTRY3(PSRLQ_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+            X86_OP_ENTRY3(PSRLDQ_i, H,x, U,x, I,b, vex7 avx2_256 p_66),
+            {},
+            {},
+            X86_OP_ENTRY3(PSLLQ_i,  H,x, U,x, I,b, vex7 mmx avx2_256 p_00_66),
+            X86_OP_ENTRY3(PSLLDQ_i, H,x, U,x, I,b, vex7 avx2_256 p_66),
+        }
+    };
+
+    int op = (get_modrm(s, env) >> 3) & 7;
+    *entry = group[*b - 0x71][op];
+}
+
 static void decode_0F6F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     if (s->prefix & PREFIX_REPNZ) {
@@ -159,6 +214,18 @@  static void decode_0F6F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
     }
 }
 
+static void decode_0F70(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry pshufw[4] = {
+        X86_OP_ENTRY3(PSHUFW,  P,q, Q,q, I,b, vex4),
+        X86_OP_ENTRY3(PSHUFD,  V,x, W,x, I,b, vex4 avx2_256),
+        X86_OP_ENTRY3(PSHUFHW, V,x, W,x, I,b, vex4 avx2_256),
+        X86_OP_ENTRY3(PSHUFLW, V,x, W,x, I,b, vex4 avx2_256),
+    };
+
+    *entry = *decode_by_prefix(s, pshufw);
+}
+
 static void decode_0F7E(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86OpEntry movd_from_vec =
@@ -292,6 +359,15 @@  static const X86OpEntry opcodes_0F[256] = {
     [0x66] = X86_OP_ENTRY3(PCMPGTD,    V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
     [0x67] = X86_OP_ENTRY3(PACKUSWB,   V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
 
+    [0x70] = X86_OP_GROUP0(0F70),
+    [0x71] = X86_OP_GROUP0(group12_13_14),
+    [0x72] = X86_OP_GROUP0(group12_13_14),
+    [0x73] = X86_OP_GROUP0(group12_13_14),
+    [0x74] = X86_OP_ENTRY3(PCMPEQB,    V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x75] = X86_OP_ENTRY3(PCMPEQW,    V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x76] = X86_OP_ENTRY3(PCMPEQD,    V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
+    [0x77] = X86_OP_ENTRY0(EMMS_VZERO, vex8),
+
     [0x38] = X86_OP_GROUP0(0F38),
     [0x3a] = X86_OP_GROUP0(0F3A),
 
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 2053c9d8fb..fb01035d06 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -475,6 +475,30 @@  static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decod
 BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
 BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
 
+static inline void gen_unary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                     SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm)
+{
+    TCGv_i32 imm = tcg_const_i32(decode->immediate);
+    if (!s->vex_l) {
+        xmm(s->ptr0, s->ptr1, imm);
+    } else {
+        ymm(s->ptr0, s->ptr1, imm);
+    }
+    tcg_temp_free_i32(imm);
+}
+
+#define UNARY_IMM_SSE(uname, lname)                                                \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    gen_unary_imm_sse(s, env, decode,                                              \
+                      gen_helper_##lname##_xmm,                                    \
+                      gen_helper_##lname##_ymm);                                   \
+}
+
+UNARY_IMM_SSE(PSHUFD,     pshufd)
+UNARY_IMM_SSE(PSHUFHW,    pshufhw)
+UNARY_IMM_SSE(PSHUFLW,    pshuflw)
+
 static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
 {
     TCGv carry_in = NULL;
@@ -633,6 +657,29 @@  static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, tcg_const_i32(8 << ot));
 }
 
+static void gen_EMMS_VZERO(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    if (!(s->prefix & PREFIX_VEX)) {
+        gen_helper_emms(cpu_env);
+        return;
+    }
+    if (s->vex_l) {
+        gen_helper_vzeroall(cpu_env);
+#ifdef TARGET_X86_64
+        if (CODE64(s)) {
+            gen_helper_vzeroall_hi8(cpu_env);
+        }
+#endif
+    } else {
+        gen_helper_vzeroupper(cpu_env);
+#ifdef TARGET_X86_64
+        if (CODE64(s)) {
+            gen_helper_vzeroupper_hi8(cpu_env);
+        }
+#endif
+    }
+}
+
 static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -787,6 +834,33 @@  static void gen_PANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
                       decode->op[1].offset, vec_len, vec_len);
 }
 
+static void gen_PCMPEQB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    tcg_gen_gvec_cmp(TCG_COND_EQ, MO_8,
+                     decode->op[0].offset, decode->op[1].offset,
+                     decode->op[2].offset, vec_len, vec_len);
+}
+
+static void gen_PCMPEQW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    tcg_gen_gvec_cmp(TCG_COND_EQ, MO_16,
+                     decode->op[0].offset, decode->op[1].offset,
+                     decode->op[2].offset, vec_len, vec_len);
+}
+
+static void gen_PCMPEQD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    tcg_gen_gvec_cmp(TCG_COND_EQ, MO_32,
+                     decode->op[0].offset, decode->op[1].offset,
+                     decode->op[2].offset, vec_len, vec_len);
+}
+
 static void gen_PCMPGTB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     int vec_len = sse_vec_len(s, decode);
@@ -841,6 +915,164 @@  static void gen_POR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
                     decode->op[2].offset, vec_len, vec_len);
 }
 
+static void gen_PSHUFW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    TCGv_i32 imm = tcg_const_i32(decode->immediate);
+    gen_helper_pshufw_mmx(s->ptr0, s->ptr1, imm);
+    tcg_temp_free_i32(imm);
+}
+
+static void gen_PSRLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    if (decode->immediate >= 16) {
+        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    } else {
+        tcg_gen_gvec_shri(MO_16,
+                          decode->op[0].offset, decode->op[1].offset,
+                          decode->immediate, vec_len, vec_len);
+    }
+}
+
+static void gen_PSLLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    if (decode->immediate >= 16) {
+        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    } else {
+        tcg_gen_gvec_shli(MO_16,
+                          decode->op[0].offset, decode->op[1].offset,
+                          decode->immediate, vec_len, vec_len);
+    }
+}
+
+static void gen_PSRAW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    if (decode->immediate >= 16) {
+        decode->immediate = 15;
+    }
+    tcg_gen_gvec_sari(MO_16,
+                      decode->op[0].offset, decode->op[1].offset,
+                      decode->immediate, vec_len, vec_len);
+}
+
+static void gen_PSRLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    if (decode->immediate >= 32) {
+        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    } else {
+        tcg_gen_gvec_shri(MO_32,
+                          decode->op[0].offset, decode->op[1].offset,
+                          decode->immediate, vec_len, vec_len);
+    }
+}
+
+static void gen_PSLLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    if (decode->immediate >= 32) {
+        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    } else {
+        tcg_gen_gvec_shli(MO_32,
+                          decode->op[0].offset, decode->op[1].offset,
+                          decode->immediate, vec_len, vec_len);
+    }
+}
+
+static void gen_PSRAD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    if (decode->immediate >= 32) {
+        decode->immediate = 31;
+    }
+    tcg_gen_gvec_sari(MO_32,
+                      decode->op[0].offset, decode->op[1].offset,
+                      decode->immediate, vec_len, vec_len);
+}
+
+static void gen_PSRLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    if (decode->immediate >= 64) {
+        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    } else {
+        tcg_gen_gvec_shri(MO_64,
+                          decode->op[0].offset, decode->op[1].offset,
+                          decode->immediate, vec_len, vec_len);
+    }
+}
+
+static void gen_PSLLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+
+    if (decode->immediate >= 64) {
+        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    } else {
+        tcg_gen_gvec_shli(MO_64,
+                          decode->op[0].offset, decode->op[1].offset,
+                          decode->immediate, vec_len, vec_len);
+    }
+}
+
+static inline TCGv_ptr make_imm_mmx_vec(uint32_t imm)
+{
+    TCGv_i64 imm_v = tcg_const_i64(imm);
+    TCGv_ptr ptr = tcg_temp_new_ptr();
+    tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, mmx_t0));
+    tcg_gen_st_i64(imm_v, ptr, offsetof(MMXReg, MMX_Q(0)));
+    return ptr;
+}
+
+static inline TCGv_ptr make_imm_xmm_vec(uint32_t imm, int vec_len)
+{
+    MemOp ot = vec_len == 16 ? MO_128 : MO_256;
+    TCGv_i32 imm_v = tcg_const_i32(imm);
+    TCGv_ptr ptr = tcg_temp_new_ptr();
+
+    tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_t0) + xmm_offset(ot),
+                         vec_len, vec_len, 0);
+
+    tcg_gen_addi_ptr(ptr, cpu_env, offsetof(CPUX86State, xmm_t0));
+    tcg_gen_st_i32(imm_v, ptr, offsetof(ZMMReg, ZMM_L(0)));
+    return ptr;
+}
+
+static void gen_PSRLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+    TCGv_ptr imm_vec = make_imm_xmm_vec(decode->immediate, vec_len);
+
+    if (s->vex_l) {
+        gen_helper_psrldq_ymm(cpu_env, s->ptr0, s->ptr1, imm_vec);
+    } else {
+        gen_helper_psrldq_xmm(cpu_env, s->ptr0, s->ptr1, imm_vec);
+    }
+    tcg_temp_free_ptr(imm_vec);
+}
+
+static void gen_PSLLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = sse_vec_len(s, decode);
+    TCGv_ptr imm_vec = make_imm_xmm_vec(decode->immediate, vec_len);
+
+    if (s->vex_l) {
+        gen_helper_pslldq_ymm(cpu_env, s->ptr0, s->ptr1, imm_vec);
+    } else {
+        gen_helper_pslldq_xmm(cpu_env, s->ptr0, s->ptr1, imm_vec);
+    }
+    tcg_temp_free_ptr(imm_vec);
+}
+
 static void gen_PXOR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     int vec_len = sse_vec_len(s, decode);
diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c
index 819e920ec6..230907bc5c 100644
--- a/target/i386/tcg/fpu_helper.c
+++ b/target/i386/tcg/fpu_helper.c
@@ -3056,3 +3056,49 @@  void helper_movq(CPUX86State *env, void *d, void *s)
 
 #define SHIFT 2
 #include "ops_sse.h"
+
+void helper_vzeroall(CPUX86State *env)
+{
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        env->xmm_regs[i].ZMM_Q(0) = 0;
+        env->xmm_regs[i].ZMM_Q(1) = 0;
+        env->xmm_regs[i].ZMM_Q(2) = 0;
+        env->xmm_regs[i].ZMM_Q(3) = 0;
+    }
+}
+
+void helper_vzeroupper(CPUX86State *env)
+{
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        env->xmm_regs[i].ZMM_Q(2) = 0;
+        env->xmm_regs[i].ZMM_Q(3) = 0;
+    }
+}
+
+#ifdef TARGET_X86_64
+void helper_vzeroall_hi8(CPUX86State *env)
+{
+    int i;
+
+    for (i = 8; i < 16; i++) {
+        env->xmm_regs[i].ZMM_Q(0) = 0;
+        env->xmm_regs[i].ZMM_Q(1) = 0;
+        env->xmm_regs[i].ZMM_Q(2) = 0;
+        env->xmm_regs[i].ZMM_Q(3) = 0;
+    }
+}
+
+void helper_vzeroupper_hi8(CPUX86State *ense_new &&
-            ((b >= 0x150 && b <= 0x16f) ||
-             (b >= 0x178 && b <= 0x17f) ||
+            ((b >= 0x150 && b <= 0x17f) ||
              (b >= 0x1d8 && b <= 0x1ff && (b & 8)))) {
             return disas_insn_new(s, cpu, b + 0x100);
         }
-- 
2.37.2