@@ -3288,6 +3288,71 @@ void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
 #endif
 }
+#define VGATHER_HELPER(scale) /* scale = SIB scale field (0-3) */ \
+void glue(helper_vpgatherdd ## scale, SUFFIX)(CPUX86State *env, \
+        Reg *d, Reg *v, Reg *s, target_ulong a0) \
+{ \
+    int i; \
+    for (i = 0; i < (2 << SHIFT); i++) { /* dword indices, dword data */ \
+        if (v->L(i) >> 31) { /* gather only when the mask element MSB is set */ \
+            target_ulong addr = a0 \
+                + ((target_ulong)(int32_t)s->L(i) << scale); \
+            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC()); /* may fault */ \
+        } \
+        v->L(i) = 0; /* mask element is cleared even when not gathered */ \
+    } \
+} \
+void glue(helper_vpgatherdq ## scale, SUFFIX)(CPUX86State *env, \
+        Reg *d, Reg *v, Reg *s, target_ulong a0) \
+{ \
+    int i; \
+    for (i = 0; i < (1 << SHIFT); i++) { /* dword indices, qword data */ \
+        if (v->Q(i) >> 63) { \
+            target_ulong addr = a0 \
+                + ((target_ulong)(int32_t)s->L(i) << scale); /* sign-extended index */ \
+            d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC()); \
+        } \
+        v->Q(i) = 0; \
+    } \
+} \
+void glue(helper_vpgatherqd ## scale, SUFFIX)(CPUX86State *env, \
+        Reg *d, Reg *v, Reg *s, target_ulong a0) \
+{ \
+    int i; \
+    for (i = 0; i < (1 << SHIFT); i++) { /* qword indices, dword data */ \
+        if (v->L(i) >> 31) { \
+            target_ulong addr = a0 \
+                + ((target_ulong)(int64_t)s->Q(i) << scale); \
+            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC()); \
+        } \
+        v->L(i) = 0; \
+    } \
+    d->Q(SHIFT) = 0; /* dest uses only the low half; zero the unused part */ \
+    v->Q(SHIFT) = 0; \
+    YMM_ONLY( /* also clear the top quadword of a ymm register */ \
+    d->Q(3) = 0; \
+    v->Q(3) = 0; \
+    ) \
+} \
+void glue(helper_vpgatherqq ## scale, SUFFIX)(CPUX86State *env, \
+        Reg *d, Reg *v, Reg *s, target_ulong a0) \
+{ \
+    int i; \
+    for (i = 0; i < (1 << SHIFT); i++) { /* qword indices, qword data */ \
+        if (v->Q(i) >> 63) { \
+            target_ulong addr = a0 \
+                + ((target_ulong)(int64_t)s->Q(i) << scale); \
+            d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC()); \
+        } \
+        v->Q(i) = 0; \
+    } \
+}
+
+VGATHER_HELPER(0) /* addresses scaled by 1 */
+VGATHER_HELPER(1) /* addresses scaled by 2 */
+VGATHER_HELPER(2) /* addresses scaled by 4 */
+VGATHER_HELPER(3) /* addresses scaled by 8 */
+
 #if SHIFT == 2
 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
@@ -433,6 +433,22 @@ DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, Reg, tl)
 DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl)
 DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, Reg, Reg, Reg)
 DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_5(glue(vpgatherdd0, SUFFIX), void, env, Reg, Reg, Reg, tl) /* trailing digit = SIB scale */
+DEF_HELPER_5(glue(vpgatherdq0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq3, SUFFIX), void, env, Reg, Reg, Reg, tl)
 #if SHIFT == 2
 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
@@ -3315,6 +3315,10 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = {
     /* vpmaskmovd, vpmaskmovq */
     [0x8c] = BINARY_OP(vpmaskmovd, AVX, SSE_OPF_AVX2),
     [0x8e] = SPECIAL_OP(AVX), /* vpmaskmovd, vpmaskmovq */
+    [0x90] = SPECIAL_OP(AVX), /* vpgatherdd, vpgatherdq (VSIB, decoded in gen_sse) */
+    [0x91] = SPECIAL_OP(AVX), /* vpgatherqd, vpgatherqq (VSIB, decoded in gen_sse) */
+    [0x92] = SPECIAL_OP(AVX), /* vgatherdps, vgatherdpd (VSIB, decoded in gen_sse) */
+    [0x93] = SPECIAL_OP(AVX), /* vgatherqps, vgatherqpd (VSIB, decoded in gen_sse) */
 #define gen_helper_aesimc_ymm NULL
     [0xdb] = UNARY_OP(aesimc, AES, 0),
     [0xdc] = BINARY_OP(aesenc, AES, 0),
@@ -3381,6 +3385,25 @@ static const SSEFunc_0_eppt sse_op_table9[2][2] = {
     SSE_OP(vpmaskmovd_st),
     SSE_OP(vpmaskmovq_st),
 };
+
+static const SSEFunc_0_epppt sse_op_table10[16][2] = { /* [REX.W | (b & 1) << 1 | scale << 2][vex_l] */
+    SSE_OP(vpgatherdd0), /* scale field 0: index * 1 */
+    SSE_OP(vpgatherdq0),
+    SSE_OP(vpgatherqd0),
+    SSE_OP(vpgatherqq0),
+    SSE_OP(vpgatherdd1), /* scale field 1: index * 2 */
+    SSE_OP(vpgatherdq1),
+    SSE_OP(vpgatherqd1),
+    SSE_OP(vpgatherqq1),
+    SSE_OP(vpgatherdd2), /* scale field 2: index * 4 */
+    SSE_OP(vpgatherdq2),
+    SSE_OP(vpgatherqd2),
+    SSE_OP(vpgatherqq2),
+    SSE_OP(vpgatherdd3), /* scale field 3: index * 8 */
+    SSE_OP(vpgatherdq3),
+    SSE_OP(vpgatherqd3),
+    SSE_OP(vpgatherqq3),
+};
 #undef SSE_OP
 /* VEX prefix not allowed */
@@ -4350,6 +4373,68 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
     }
     op1_offset = ZMM_OFFSET(reg);
 
+    if ((b & 0xfc) == 0x90) { /* vgather family: opcodes 0x90-0x93 */
+        int scale, index, base;
+        target_long disp = 0;
+        CHECK_AVX2(s);
+        /* Gathers require a memory operand with a SIB byte (rm == 4). */
+        if (mod == 3 || rm != 4) {
+            goto illegal_op;
+        }
+
+        /* Vector SIB: the index field selects a vector register. */
+        val = x86_ldub_code(env, s);
+        scale = (val >> 6) & 3;
+        index = ((val >> 3) & 7) | REX_X(s);
+        base = (val & 7) | REX_B(s);
+        switch (mod) {
+        case 0:
+            /* With mod == 0, a base field of 0b101 means disp32 and no
+               base register.  REX.B is ignored for this special case,
+               so test only the low three bits.  */
+            if ((base & 7) == 5) {
+                base = -1;
+                disp = (int32_t)x86_ldl_code(env, s);
+            }
+            break;
+        case 1:
+            disp = (int8_t)x86_ldub_code(env, s);
+            break;
+        default:
+        case 2:
+            disp = (int32_t)x86_ldl_code(env, s);
+            break;
+        }
+
+        /* destination, index and mask registers must not overlap */
+        if (reg == index || reg == reg_v) {
+            goto illegal_op;
+        }
+
+        if (base >= 0) {
+            tcg_gen_addi_tl(s->A0, cpu_regs[base], disp);
+        } else {
+            /* No base register: the address is just the displacement. */
+            tcg_gen_movi_tl(s->A0, disp);
+        }
+        gen_add_A0_ds_seg(s);
+        op2_offset = ZMM_OFFSET(index);
+        v_offset = ZMM_OFFSET(reg_v);
+        tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
+        tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
+        tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset);
+        /* Helper selection: REX.W picks the element size, opcode bit 0
+           picks the index size, plus the SIB scale factor.  */
+        b1 = REX_W(s) | ((b & 1) << 1) | (scale << 2);
+        sse_op_table10[b1][s->vex_l](cpu_env,
+                                     s->ptr0, s->ptr2, s->ptr1, s->A0);
+        if (!s->vex_l) {
+            gen_clear_ymmh(s, reg);
+            gen_clear_ymmh(s, reg_v);
+        }
+        return;
+    }
+
     if (op6.flags & SSE_OPF_MMX) {
         CHECK_AVX2_256(s);
     }
These are gather load instructions that need to introduce a new "Vector SIB" encoding. Also a bit of hair to handle different index sizes and scaling factors, but overall the combinatorial explosion doesn't end up too bad. The other thing of note is probably that these also modify the mask operand. Thankfully the operands may not overlap, and we do not have to make the whole thing appear atomic. Signed-off-by: Paul Brook <paul@nowt.org> --- target/i386/ops_sse.h | 65 +++++++++++++++++++++++++++++++ target/i386/ops_sse_header.h | 16 ++++++++ target/i386/tcg/translate.c | 74 ++++++++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+)