diff mbox series

[v2,34/42] i386: Implement VGATHER

Message ID 20220424220204.2493824-35-paul@nowt.org
State New
Headers show
Series AVX guest implementation | expand

Commit Message

Paul Brook April 24, 2022, 10:01 p.m. UTC
These are scatter load instructions that need introduce a new "Vector SIB"
encoding.  Also a bit of hair to handle different index sizes and scaling
factors, but overall the combinatorial explosion doesn't end up too bad.

The other thing of note is probably that these also modify the mask operand.
Thankfully the operands may not overlap, and we do not have to make the whole
thing appear atomic.

Signed-off-by: Paul Brook <paul@nowt.org>
---
 target/i386/ops_sse.h        | 65 +++++++++++++++++++++++++++++++
 target/i386/ops_sse_header.h | 16 ++++++++
 target/i386/tcg/translate.c  | 74 ++++++++++++++++++++++++++++++++++++
 3 files changed, 155 insertions(+)
diff mbox series

Patch

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index ffcba3d02c..14a2d1bf78 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3288,6 +3288,71 @@  void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
 #endif
 }
 
+#define VGATHER_HELPER(scale)                                       \
+void glue(helper_vpgatherdd ## scale, SUFFIX)(CPUX86State *env,     \
+        Reg *d, Reg *v, Reg *s, target_ulong a0)                    \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < (2 << SHIFT); i++) {                            \
+        if (v->L(i) >> 31) {                                        \
+            target_ulong addr = a0                                  \
+                + ((target_ulong)(int32_t)s->L(i) << scale);        \
+            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());          \
+        }                                                           \
+        v->L(i) = 0;                                                \
+    }                                                               \
+}                                                                   \
+void glue(helper_vpgatherdq ## scale, SUFFIX)(CPUX86State *env,     \
+        Reg *d, Reg *v, Reg *s, target_ulong a0)                    \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < (1 << SHIFT); i++) {                            \
+        if (v->Q(i) >> 63) {                                        \
+            target_ulong addr = a0                                  \
+                + ((target_ulong)(int32_t)s->L(i) << scale);        \
+            d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());          \
+        }                                                           \
+        v->Q(i) = 0;                                                \
+    }                                                               \
+}                                                                   \
+void glue(helper_vpgatherqd ## scale, SUFFIX)(CPUX86State *env,     \
+        Reg *d, Reg *v, Reg *s, target_ulong a0)                    \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < (1 << SHIFT); i++) {                            \
+        if (v->L(i) >> 31) {                                        \
+            target_ulong addr = a0                                  \
+                + ((target_ulong)(int64_t)s->Q(i) << scale);        \
+            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());          \
+        }                                                           \
+        v->L(i) = 0;                                                \
+    }                                                               \
+    d->Q(SHIFT) = 0;                                                    \
+    v->Q(SHIFT) = 0;                                                    \
+    YMM_ONLY(                                                       \
+    d->Q(3) = 0;                                                    \
+    v->Q(3) = 0;                                                    \
+    )                                                               \
+}                                                                   \
+void glue(helper_vpgatherqq ## scale, SUFFIX)(CPUX86State *env,     \
+        Reg *d, Reg *v, Reg *s, target_ulong a0)                    \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < (1 << SHIFT); i++) {                            \
+        if (v->Q(i) >> 63) {                                        \
+            target_ulong addr = a0                                  \
+                + ((target_ulong)(int64_t)s->Q(i) << scale);        \
+            d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());          \
+        }                                                           \
+        v->Q(i) = 0;                                                \
+    }                                                               \
+}
+
+VGATHER_HELPER(0)
+VGATHER_HELPER(1)
+VGATHER_HELPER(2)
+VGATHER_HELPER(3)
+
 #if SHIFT == 2
 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index a7a6bf6b10..e5d8ea9bb7 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -433,6 +433,22 @@  DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, Reg, tl)
 DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl)
 DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, Reg, Reg, Reg)
 DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_5(glue(vpgatherdd0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq3, SUFFIX), void, env, Reg, Reg, Reg, tl)
 #if SHIFT == 2
 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index e00195d301..fe1ab58d07 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3315,6 +3315,10 @@  static const struct SSEOpHelper_table6 sse_op_table6[256] = {
     /* vpmaskmovd, vpmaskmovq */
     [0x8c] = BINARY_OP(vpmaskmovd, AVX, SSE_OPF_AVX2),
     [0x8e] = SPECIAL_OP(AVX), /* vpmaskmovd, vpmaskmovq */
+    [0x90] = SPECIAL_OP(AVX), /* vpgatherdd, vpgatherdq */
+    [0x91] = SPECIAL_OP(AVX), /* vpgatherqd, vpgatherqq */
+    [0x92] = SPECIAL_OP(AVX), /* vgatherdpd, vgatherdps */
+    [0x93] = SPECIAL_OP(AVX), /* vgatherqpd, vgatherqps */
 #define gen_helper_aesimc_ymm NULL
     [0xdb] = UNARY_OP(aesimc, AES, 0),
     [0xdc] = BINARY_OP(aesenc, AES, 0),
@@ -3381,6 +3385,25 @@  static const SSEFunc_0_eppt sse_op_table9[2][2] = {
     SSE_OP(vpmaskmovd_st),
     SSE_OP(vpmaskmovq_st),
 };
+
+static const SSEFunc_0_epppt sse_op_table10[16][2] = {
+    SSE_OP(vpgatherdd0),
+    SSE_OP(vpgatherdq0),
+    SSE_OP(vpgatherqd0),
+    SSE_OP(vpgatherqq0),
+    SSE_OP(vpgatherdd1),
+    SSE_OP(vpgatherdq1),
+    SSE_OP(vpgatherqd1),
+    SSE_OP(vpgatherqq1),
+    SSE_OP(vpgatherdd2),
+    SSE_OP(vpgatherdq2),
+    SSE_OP(vpgatherqd2),
+    SSE_OP(vpgatherqq2),
+    SSE_OP(vpgatherdd3),
+    SSE_OP(vpgatherdq3),
+    SSE_OP(vpgatherqd3),
+    SSE_OP(vpgatherqq3),
+};
 #undef SSE_OP
 
 /* VEX prefix not allowed */
@@ -4350,6 +4373,57 @@  static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                 }
                 op1_offset = ZMM_OFFSET(reg);
 
+                if ((b & 0xfc) == 0x90) { /* vgather */
+                    int scale, index, base;
+                    target_long disp = 0;
+                    CHECK_AVX2(s);
+                    if (mod == 3 || rm != 4) {
+                        goto illegal_op;
+                    }
+
+                    /* Vector SIB */
+                    val = x86_ldub_code(env, s);
+                    scale = (val >> 6) & 3;
+                    index = ((val >> 3) & 7) | REX_X(s);
+                    base = (val & 7) | REX_B(s);
+                    switch (mod) {
+                    case 0:
+                        if (base == 5) {
+                            base = -1;
+                            disp = (int32_t)x86_ldl_code(env, s);
+                        }
+                        break;
+                    case 1:
+                        disp = (int8_t)x86_ldub_code(env, s);
+                        break;
+                    default:
+                    case 2:
+                        disp = (int32_t)x86_ldl_code(env, s);
+                        break;
+                    }
+
+                    /* destination, index and mask registers must not overlap */
+                    if (reg == index || reg == reg_v) {
+                        goto illegal_op;
+                    }
+
+                    tcg_gen_addi_tl(s->A0, cpu_regs[base], disp);
+                    gen_add_A0_ds_seg(s);
+                    op2_offset = ZMM_OFFSET(index);
+                    v_offset = ZMM_OFFSET(reg_v);
+                    tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
+                    tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
+                    tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset);
+                    b1 = REX_W(s) | ((b & 1) << 1) | (scale << 2);
+                    sse_op_table10[b1][s->vex_l](cpu_env,
+                            s->ptr0, s->ptr2, s->ptr1, s->A0);
+                    if (!s->vex_l) {
+                        gen_clear_ymmh(s, reg);
+                        gen_clear_ymmh(s, reg_v);
+                    }
+                    return;
+                }
+
                 if (op6.flags & SSE_OPF_MMX) {
                     CHECK_AVX2_256(s);
                 }