Message ID | 20190731175702.4916-9-jan.bobek@gmail.com |
---|---|
State | New |
Headers | show |
Series | reimplement (some) x86 vector instructions using tcg-gvec | expand |
On 7/31/19 10:56 AM, Jan Bobek wrote: > +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112) > +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112) > +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123) > +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123) > +#define gen_andps_xmm gen_pand_xmm > +#define gen_vandps_xmm gen_vpand_xmm > +#define gen_vandps_ymm gen_vpand_ymm > +#define gen_andpd_xmm gen_pand_xmm > +#define gen_vandpd_xmm gen_vpand_xmm > +#define gen_vandpd_ymm gen_vpand_ymm Why all of these extra defines? > + enum { > + M_0F = 0x01 << 8, > + M_0F38 = 0x02 << 8, > + M_0F3A = 0x04 << 8, > + P_66 = 0x08 << 8, > + P_F3 = 0x10 << 8, > + P_F2 = 0x20 << 8, > + VEX_128 = 0x40 << 8, > + VEX_256 = 0x80 << 8, > + }; > + > + switch(b | M_0F > + | (s->prefix & PREFIX_DATA ? P_66 : 0) > + | (s->prefix & PREFIX_REPZ ? P_F3 : 0) > + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0) > + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) { I think you can move this above almost everything in this function, so that all of the legacy bits follow this switch. > + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return; You'll want to put these on the next lines -- checkpatch.pl again. 
> + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return; > + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return; > + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return; > + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return; > + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return; > + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return; > + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return; > + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return; > + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return; > + default: break; > + } Perhaps group cases together? case 0xdb | M_0F | P_66: /* PAND */ case 0x54 | M_0F: /* ANDPS */ case 0x54 | M_0F | P_66: /* ANDPD */ gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112); return; How are you planning to handle CPUID checks? I know the currently handling is quite spotty, but with a reorg we might as well fix that too. r~
On Wed, Jul 31, 2019 at 9:36 PM Richard Henderson < richard.henderson@linaro.org> wrote: > On 7/31/19 10:56 AM, Jan Bobek wrote: > > +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), > (modrm), MO_64, tcg_gen_gvec_and, 0112) > > +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), > (modrm), MO_64, tcg_gen_gvec_and, 0112) > > +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), > (modrm), MO_64, tcg_gen_gvec_and, 0123) > > +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), > (modrm), MO_64, tcg_gen_gvec_and, 0123) > > +#define gen_andps_xmm gen_pand_xmm > > +#define gen_vandps_xmm gen_vpand_xmm > > +#define gen_vandps_ymm gen_vpand_ymm > > +#define gen_andpd_xmm gen_pand_xmm > > +#define gen_vandpd_xmm gen_vpand_xmm > > +#define gen_vandpd_ymm gen_vpand_ymm > > > Why all of these extra defines? > Because of code clarity and safety, I would say. This line: case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return; looks much clearer than this one: case 0x54 | M_0F: gen_gvec_ld_modrm_mm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112) and such organization is also much less prone to copy/paste bugs etc. Needless to say there is no performance price at all, so it is a no-brainer. Sincerely, Aleksandar > > > + enum { > > + M_0F = 0x01 << 8, > > + M_0F38 = 0x02 << 8, > > + M_0F3A = 0x04 << 8, > > + P_66 = 0x08 << 8, > > + P_F3 = 0x10 << 8, > > + P_F2 = 0x20 << 8, > > + VEX_128 = 0x40 << 8, > > + VEX_256 = 0x80 << 8, > > + }; > > + > > + switch(b | M_0F > > + | (s->prefix & PREFIX_DATA ? P_66 : 0) > > + | (s->prefix & PREFIX_REPZ ? P_F3 : 0) > > + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0) > > + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : > 0)) { > > I think you can move this above almost everything in this function, so > that all > of the legacy bits follow this switch. 
> > > + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); > return; > > You'll want to put these on the next lines -- checkpatch.pl again. > > > + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); > return; > > + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); > return; > > + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); > return; > > + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); > return; > > + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); > return; > > + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); > return; > > + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); > return; > > + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); > return; > > + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); > return; > > + default: break; > > + } > > Perhaps group cases together? > > case 0xdb | M_0F | P_66: /* PAND */ > case 0x54 | M_0F: /* ANDPS */ > case 0x54 | M_0F | P_66: /* ANDPD */ > gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112); > return; > > How are you planning to handle CPUID checks? I know the currently > handling is > quite spotty, but with a reorg we might as well fix that too. > > > r~ > >
On 7/31/19 1:27 PM, Aleksandar Markovic wrote: > > > On Wed, Jul 31, 2019 at 9:36 PM Richard Henderson <richard.henderson@linaro.org > <mailto:richard.henderson@linaro.org>> wrote: > > On 7/31/19 10:56 AM, Jan Bobek wrote: > > +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), > (modrm), MO_64, tcg_gen_gvec_and, 0112) > > +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), > (modrm), MO_64, tcg_gen_gvec_and, 0112) > > +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), > (modrm), MO_64, tcg_gen_gvec_and, 0123) > > +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), > (modrm), MO_64, tcg_gen_gvec_and, 0123) > > +#define gen_andps_xmm gen_pand_xmm > > +#define gen_vandps_xmm gen_vpand_xmm > > +#define gen_vandps_ymm gen_vpand_ymm > > +#define gen_andpd_xmm gen_pand_xmm > > +#define gen_vandpd_xmm gen_vpand_xmm > > +#define gen_vandpd_ymm gen_vpand_ymm > > > Why all of these extra defines? > > > Because of code clarity and safety, I would say. > > This line: > > case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return; > > looks much clearer than this one: > > case 0x54 | M_0F: gen_gvec_ld_modrm_mm(env, s, modrm, MO_64, > tcg_gen_gvec_and, 0112) > > and such organization is also much less prone to copy/paste bugs etc. I'm not convinced. These macros will be used exactly once. Because there is no reuse, there is no added safety. There is only the chance of a typo in a location removed from the actual use within the switch. I agree that the goal is to minimize any replication per case. But I don't think this particular arrangement of macros is the way to accomplish that. r~
On 7/31/19 3:35 PM, Richard Henderson wrote: > On 7/31/19 10:56 AM, Jan Bobek wrote: >> +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112) >> +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112) >> +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123) >> +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123) >> +#define gen_andps_xmm gen_pand_xmm >> +#define gen_vandps_xmm gen_vpand_xmm >> +#define gen_vandps_ymm gen_vpand_ymm >> +#define gen_andpd_xmm gen_pand_xmm >> +#define gen_vandpd_xmm gen_vpand_xmm >> +#define gen_vandpd_ymm gen_vpand_ymm > > > Why all of these extra defines? > >> + enum { >> + M_0F = 0x01 << 8, >> + M_0F38 = 0x02 << 8, >> + M_0F3A = 0x04 << 8, >> + P_66 = 0x08 << 8, >> + P_F3 = 0x10 << 8, >> + P_F2 = 0x20 << 8, >> + VEX_128 = 0x40 << 8, >> + VEX_256 = 0x80 << 8, >> + }; >> + >> + switch(b | M_0F >> + | (s->prefix & PREFIX_DATA ? P_66 : 0) >> + | (s->prefix & PREFIX_REPZ ? P_F3 : 0) >> + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0) >> + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) { > > I think you can move this above almost everything in this function, so that all > of the legacy bits follow this switch. > >> + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return; > > You'll want to put these on the next lines -- checkpatch.pl again. 
> >> + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return; >> + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return; >> + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return; >> + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return; >> + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return; >> + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return; >> + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return; >> + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return; >> + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return; >> + default: break; >> + } > > Perhaps group cases together? > > case 0xdb | M_0F | P_66: /* PAND */ > case 0x54 | M_0F: /* ANDPS */ > case 0x54 | M_0F | P_66: /* ANDPD */ > gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112); > return; As Aleksandar pointed out in his email, the general intuition was to have self-documenting code. Seeing case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return; clearly states that this particular case is a VANDPS, and if one wants to see what we do with it, they can go look gen_vandps_ymm up. That being said, I have to the conclusion in the meantime that keeping all the extra macros is just too much code and not worth it, so I'll do it like you suggest above. > How are you planning to handle CPUID checks? I know the currently handling is > quite spotty, but with a reorg we might as well fix that too. Good question. CPUID checks are not handled in this patch at all, I will need to come up with a workable approach. -Jan
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index ed05989768..b3ba23287d 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -353,7 +353,6 @@ static inline int satsw(int x) #define FMAXUB(a, b) ((a) > (b)) ? (a) : (b) #define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b) -#define FAND(a, b) ((a) & (b)) #define FANDN(a, b) ((~(a)) & (b)) #define FOR(a, b) ((a) | (b)) #define FXOR(a, b) ((a) ^ (b)) @@ -397,7 +396,6 @@ SSE_HELPER_B(helper_pmaxub, FMAXUB) SSE_HELPER_W(helper_pminsw, FMINSW) SSE_HELPER_W(helper_pmaxsw, FMAXSW) -SSE_HELPER_Q(helper_pand, FAND) SSE_HELPER_Q(helper_pandn, FANDN) SSE_HELPER_Q(helper_por, FOR) SSE_HELPER_Q(helper_pxor, FXOR) diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 094aafc573..63b4376389 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -86,7 +86,6 @@ SSE_HELPER_B(pmaxub, FMAXUB) SSE_HELPER_W(pminsw, FMINSW) SSE_HELPER_W(pmaxsw, FMAXSW) -SSE_HELPER_Q(pand, FAND) SSE_HELPER_Q(pandn, FANDN) SSE_HELPER_Q(por, FOR) SSE_HELPER_Q(pxor, FXOR) diff --git a/target/i386/translate.c b/target/i386/translate.c index d576b3345c..3821733a4e 100644 --- a/target/i386/translate.c +++ b/target/i386/translate.c @@ -23,6 +23,7 @@ #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg-op.h" +#include "tcg-op-gvec.h" #include "exec/cpu_ldst.h" #include "exec/translator.h" @@ -2723,6 +2724,7 @@ typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, #define SSE_SPECIAL ((void *)1) #define SSE_DUMMY ((void *)2) +#define SSE_TOMBSTONE ((void *)3) #define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm } #define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \ @@ -2754,7 +2756,7 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = { [0x51] = SSE_FOP(sqrt), [0x52] = { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL }, [0x53] = { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL }, - [0x54] = { 
gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, andpd */ + [0x54] = { SSE_TOMBSTONE, SSE_TOMBSTONE }, /* andps, andpd */ [0x55] = { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, andnpd */ [0x56] = { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */ [0x57] = { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xorpd */ @@ -2823,7 +2825,7 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = { [0xd8] = MMX_OP2(psubusb), [0xd9] = MMX_OP2(psubusw), [0xda] = MMX_OP2(pminub), - [0xdb] = MMX_OP2(pand), + [0xdb] = { SSE_TOMBSTONE, SSE_TOMBSTONE }, [0xdc] = MMX_OP2(paddusb), [0xdd] = MMX_OP2(paddusw), [0xde] = MMX_OP2(pmaxub), @@ -3164,6 +3166,17 @@ static inline void gen_gvec_ld_modrm_3(CPUX86State *env, DisasContext *s, gen_ld_modrm_VxHxWx, \ gen_gvec_2_fp, (opctl)) +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112) +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112) +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123) +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123) +#define gen_andps_xmm gen_pand_xmm +#define gen_vandps_xmm gen_vpand_xmm +#define gen_vandps_ymm gen_vpand_ymm +#define gen_andpd_xmm gen_pand_xmm +#define gen_vandpd_xmm gen_vpand_xmm +#define gen_vandpd_ymm gen_vpand_ymm + static void gen_sse(CPUX86State *env, DisasContext *s, int b) { int b1, op1_offset, op2_offset, is_xmm, val; @@ -3238,6 +3251,38 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b) reg |= REX_R(s); } mod = (modrm >> 6) & 3; + + enum { + M_0F = 0x01 << 8, + M_0F38 = 0x02 << 8, + M_0F3A = 0x04 << 8, + P_66 = 0x08 << 8, + P_F3 = 0x10 << 8, + P_F2 = 0x20 << 8, + VEX_128 = 0x40 << 8, + VEX_256 = 0x80 << 8, + }; + + switch(b | M_0F + | (s->prefix & PREFIX_DATA ? P_66 : 0) + | (s->prefix & PREFIX_REPZ ? 
P_F3 : 0) + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0) + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) { + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return; + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return; + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return; + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return; + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return; + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return; + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return; + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return; + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return; + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return; + default: break; + } + + assert(sse_fn_epp != SSE_TOMBSTONE); + if (sse_fn_epp == SSE_SPECIAL) { b |= (b1 << 8); switch(b) {
Use the gvec infrastructure to achieve the desired functionality. Note: This commit adds several bits which will not be part of the final patch series and which are only present to allow for an incremental write-and-test development cycle. Notably, the SSE_TOMBSTONE define will go away entirely with all of the tables, and nothing will follow the new dispatch switch in gen_sse. Signed-off-by: Jan Bobek <jan.bobek@gmail.com> --- target/i386/ops_sse.h | 2 -- target/i386/ops_sse_header.h | 1 - target/i386/translate.c | 49 ++++++++++++++++++++++++++++++++++-- 3 files changed, 47 insertions(+), 5 deletions(-)