| Field | Value |
|---|---|
| Message ID | 20200828183354.27913-44-peter.maydell@linaro.org |
| State | New |
| Headers | show |
| Series | target/arm: Implement fp16 for AArch32 VFP and Neon \| expand |
On 8/28/20 11:33 AM, Peter Maydell wrote: > +#define float16_nop(N, M, S) (M) > +#define float32_nop(N, M, S) (M) > +#define float64_nop(N, M, S) (M) > > +DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) > +DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) > +DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, ) > + > +/* > + * Non-fused multiply-accumulate operations, for Neon. NB that unlike > + * the fused ops below they assume accumulate both from and into Vd. > + */ > +DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) > +DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) > +DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) > +DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) > + > +#undef float16_nop > +#undef float32_nop > +#undef float64_nop This floatN_nop stuff is pretty ugly. Better to pass in either floatN_mul, or the floatN_muladd_nf helpers that you added earlier. Although I guess you're missing float64_muladd_nf so far. r~
On Sat, 29 Aug 2020 at 00:24, Richard Henderson <richard.henderson@linaro.org> wrote: > > On 8/28/20 11:33 AM, Peter Maydell wrote: > > +#define float16_nop(N, M, S) (M) > > +#define float32_nop(N, M, S) (M) > > +#define float64_nop(N, M, S) (M) > > > > +DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) > > +DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) > > +DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, ) > > + > > +/* > > + * Non-fused multiply-accumulate operations, for Neon. NB that unlike > > + * the fused ops below they assume accumulate both from and into Vd. > > + */ > > +DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) > > +DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) > > +DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) > > +DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) > > + > > +#undef float16_nop > > +#undef float32_nop > > +#undef float64_nop > > This floatN_nop stuff is pretty ugly. > > Better to pass in either floatN_mul, or the floatN_muladd_nf helpers that you > added earlier. Although I guess you're missing float64_muladd_nf so far. I thought about doing that, but the float*_muladd_nf functions don't have the same signature as float*_mul -- they take (dest, op1, op2, stat) and float*_mul only takes (op1, op2, stat) -- so it doesn't work. You'd have to construct a wrapper for the mul function that took and ignored the dest argument, or split out mul entirely into its own macro rather than using DO_FMUL_IDX for mul and muladd. The nop macros seemed the simplest. thanks -- PMM
diff --git a/target/arm/helper.h b/target/arm/helper.h index cbdbf824d8d..8defd7c8019 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -740,6 +740,16 @@ DEF_HELPER_FLAGS_5(gvec_fmul_idx_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(gvec_fmul_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmla_nf_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmla_nf_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fmls_nf_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmls_nf_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_6(gvec_fmla_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG, diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c index b27b90e1dd8..a973454e4f4 100644 --- a/target/arm/vec_helper.c +++ b/target/arm/vec_helper.c @@ -1085,7 +1085,7 @@ DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, ) #undef DO_MLA_IDX -#define DO_FMUL_IDX(NAME, TYPE, H) \ +#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ { \ intptr_t i, j, oprsz = simd_oprsz(desc); \ @@ -1095,16 +1095,33 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ TYPE mm = m[H(i + idx)]; \ for (j = 0; j < segment; j++) { \ - d[i + j] = TYPE##_mul(n[i + j], mm, stat); \ + d[i + j] = TYPE##_##ADD(d[i + j], \ + TYPE##_mul(n[i + j], mm, stat), stat); \ } \ } \ clear_tail(d, oprsz, simd_maxsz(desc)); \ } -DO_FMUL_IDX(gvec_fmul_idx_h, float16, H2) -DO_FMUL_IDX(gvec_fmul_idx_s, float32, H4) -DO_FMUL_IDX(gvec_fmul_idx_d, float64, ) +#define float16_nop(N, M, S) (M) +#define float32_nop(N, M, S) (M) +#define float64_nop(N, M, S) (M) +DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) +DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) 
+DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, ) + +/* + * Non-fused multiply-accumulate operations, for Neon. NB that unlike + * the fused ops below they assume accumulate both from and into Vd. + */ +DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) +DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) +DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) +DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) + +#undef float16_nop +#undef float32_nop +#undef float64_nop #undef DO_FMUL_IDX #define DO_FMLA_IDX(NAME, TYPE, H) \
Add gvec helpers for doing Neon-style indexed non-fused fp multiply-and-accumulate operations. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> --- target/arm/helper.h | 10 ++++++++++ target/arm/vec_helper.c | 27 ++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-)