Message ID | 57398E46.9050903@foss.arm.com |
---|---|
State | New |
Headers | show |
On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote: > The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are > missing in current gcc arm_neon.h. > > Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can > also comes from "(fma (vec_dup(scalar" where the scalar value is already > sitting in vector register then duplicated to other lanes, and there is > no lane size change. > > This patch implement this and can generate better code under some > context. For example: > > cat test.c > === > typedef __Float32x2_t float32x2_t; > typedef float float32_t; > > float32x2_t > vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) > { > return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, > __c}, __a); > } > > before (-O2) > === > vfma_n_f32: > dup v2.2s, v2.s[0] > fmla v0.2s, v1.2s, v2.2s > ret > after > === > vfma_n_f32: > fmla v0.2s, v1.2s, v2.s[0] > ret > > OK for trunk? > > 2016-05-16 Jiong Wang <jiong.wang@arm.com> This ChangeLog entry is not correctly formatted. There should be two spaces between your name and your email, and each line should start with a tab. > > gcc/ > * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename > to *aarch64_fma4_elt_from_dup<mode>. > (*aarch64_fnma4_elt_to_128df): Rename to > *aarch64_fnma4_elt_from_dup<mode>. > * config/aarch64/arm_neon.h (vfma_n_f64): New. > (vfms_n_f32): Likewise. > (vfms_n_f64): Likewise. > (vfmsq_n_f32): Likewise. > (vfmsq_n_f64): Likewise. > > gcc/testsuite/ > * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use > standard syntax. > * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise. The paths of these two entries are incorrect. Remove the gcc/testsuite from the front. I don't understand what you mean by "Use standard syntax.", please fix this to describe what you are actually changing. > * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry > for float64x1. > * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New. These two changes need approval from an ARM maintainer as they are in common files. From an AArch64 perspective, this patch is OK with a fixed ChangeLog. Please wait for an ARM OK for the test changes. Thanks, James > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -1579,16 +1579,16 @@ > [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] > ) > > -(define_insn "*aarch64_fma4_elt_to_128df" > - [(set (match_operand:V2DF 0 "register_operand" "=w") > - (fma:V2DF > - (vec_duplicate:V2DF > - (match_operand:DF 1 "register_operand" "w")) > - (match_operand:V2DF 2 "register_operand" "w") > - (match_operand:V2DF 3 "register_operand" "0")))] > +(define_insn "*aarch64_fma4_elt_from_dup<mode>" > + [(set (match_operand:VMUL 0 "register_operand" "=w") > + (fma:VMUL > + (vec_duplicate:VMUL > + (match_operand:<VEL> 1 "register_operand" "w")) > + (match_operand:VMUL 2 "register_operand" "w") > + (match_operand:VMUL 3 "register_operand" "0")))] > "TARGET_SIMD" > - "fmla\\t%0.2d, %2.2d, %1.2d[0]" > - [(set_attr "type" "neon_fp_mla_d_scalar_q")] > + "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" > + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] > ) > > (define_insn "*aarch64_fma4_elt_to_64v2df" > @@ -1656,17 +1656,17 @@ > [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] > ) > > -(define_insn "*aarch64_fnma4_elt_to_128df" > - [(set (match_operand:V2DF 0 "register_operand" "=w") > - (fma:V2DF > - (neg:V2DF > - (match_operand:V2DF 2 "register_operand" "w")) > - (vec_duplicate:V2DF > - (match_operand:DF 1 "register_operand" "w")) > - (match_operand:V2DF 3 "register_operand" "0")))] > - "TARGET_SIMD" > - "fmls\\t%0.2d, %2.2d, %1.2d[0]" > - [(set_attr "type" "neon_fp_mla_d_scalar_q")] > +(define_insn "*aarch64_fnma4_elt_from_dup<mode>" > + [(set (match_operand:VMUL 0 "register_operand" "=w") > + (fma:VMUL > + (neg:VMUL > + (match_operand:VMUL 2 "register_operand" "w")) > + (vec_duplicate:VMUL > + (match_operand:<VEL> 1 "register_operand" "w")) > + (match_operand:VMUL 3 "register_operand" "0")))] > + "TARGET_SIMD" > + "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" > + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] > ) > > (define_insn "*aarch64_fnma4_elt_to_64v2df" > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) > return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); > } > > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) > +{ > + return (float64x1_t) {__b[0] * __c + __a[0]}; > +} > + > __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) > vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) > { > @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) > return __builtin_aarch64_fmav2df (-__b, __c, __a); > } > > +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) > +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) > +{ > + return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a); > +} > + > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) > +{ > + return (float64x1_t) {-__b[0] * __c + __a[0]}; > +} > + > +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) > +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) > +{ > + return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a); > +} > + > +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) > +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) > +{ > + return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); > +} > > /* vfms_lane */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h > index 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 100644 > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h > @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4); > #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) > static ARRAY(result, float, 16, 4); > #endif > +static ARRAY(result, float, 64, 1); > static ARRAY(result, float, 32, 2); > static ARRAY(result, int, 8, 16); > static ARRAY(result, int, 16, 8); > @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8); > extern ARRAY(expected, poly, 16, 4); > extern ARRAY(expected, hfloat, 16, 4); > extern ARRAY(expected, hfloat, 32, 2); > +extern ARRAY(expected, hfloat, 64, 1); > extern ARRAY(expected, int, 8, 16); > extern ARRAY(expected, int, 16, 8); > extern ARRAY(expected, int, 32, 4); > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c > new file mode 100644 > index 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c > @@ -0,0 +1,490 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > +#define A0 123.4f > +#define A1 -3.8f > +#define A2 -29.4f > +#define A3 (__builtin_inff ()) > +#define A4 0.0f > +#define A5 24.0f > +#define A6 124.0f > +#define A7 1024.0f > + > +#define B0 -5.8f > +#define B1 -0.0f > +#define B2 -10.8f > +#define B3 10.0f > +#define B4 23.4f > +#define B5 -1234.8f > +#define B6 8.9f > +#define B7 4.0f > + > +#define E0 9.8f > +#define E1 -1024.0f > +#define E2 (-__builtin_inff ()) > +#define E3 479.0f > +float32_t elem0 = E0; > +float32_t elem1 = E1; > +float32_t elem2 = E2; > +float32_t elem3 = E3; > + > +#define DA0 1231234.4 > +#define DA1 -3.8 > +#define DA2 -2980.4 > +#define DA3 -5.8 > +#define DA4 0.01123 > +#define DA5 24.0 > +#define DA6 124.12345 > +#define DA7 1024.0 > + > +#define DB0 -5.8 > +#define DB1 (__builtin_inf ()) > +#define DB2 -105.8 > +#define DB3 10.0 > +#define DB4 (-__builtin_inf ()) > +#define DB5 -1234.8 > +#define DB6 848.9 > +#define DB7 44444.0 > + > +#define DE0 9.8 > +#define DE1 -1024.0 > +#define DE2 105.8 > +#define DE3 479.0 > +float64_t delem0 = DE0; > +float64_t delem1 = DE1; > +float64_t delem2 = DE2; > +float64_t delem3 = DE3; > + > +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) > + > +/* Expected results for vfms_n. */ > + > +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0}; > +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1}; > +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2}; > +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3}; > +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0}; > +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1}; > +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2}; > +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3}; > + > +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); > + > + > +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0, > + A2 + -B2 * E0, A3 + -B3 * E0}; > +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1, > + A6 + -B6 * E1, A7 + -B7 * E1}; > +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2, > + A4 + -B4 * E2, A6 + -B6 * E2}; > +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3, > + A5 + -B5 * E3, A7 + -B7 * E3}; > +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0, > + A2 + B2 * E0, A3 + B3 * E0}; > +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1, > + A6 + B6 * E1, A7 + B7 * E1}; > +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2, > + A4 + B4 * E2, A6 + B6 * E2}; > +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3, > + A5 + B5 * E3, A7 + B7 * E3}; > + > +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); > + > +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, > + DA1 + -DB1 * DE0}; > +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1, > + DA3 + -DB3 * DE1}; > +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, > + DA5 + -DB5 * DE2}; > +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, > + DA7 + -DB7 * DE3}; > +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, > + DA1 + DB1 * DE0}; > +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, > + DA3 + DB3 * DE1}; > +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, > + DA5 + DB5 * DE2}; > +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, > + DA7 + DB7 * DE3}; > +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); > + > +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; > +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; > +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; > +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; > +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; > +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; > +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; > +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; > + > +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); > + > +void exec_vfma_vfms_n (void) > +{ > +#undef TEST_MSG > +#define TEST_MSG "VFMS_VFMA_N (FP32)" > + clean_results (); > + > + DECL_VARIABLE(vsrc_1, float, 32, 2); > + DECL_VARIABLE(vsrc_2, float, 32, 2); > + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; > + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; > + VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); > + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); > + DECL_VARIABLE (vector_res, float, 32, 2) = > + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem0); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, ""); > + VECT_VAR (vector_res, float, 32, 2) = > + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem0); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, ""); > + > + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; > + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; > + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); > + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); > + VECT_VAR (vector_res, float, 32, 2) = > + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem1); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, ""); > + VECT_VAR (vector_res, float, 32, 2) = > + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem1); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, ""); > + > + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; > + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; > + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); > + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); > + VECT_VAR (vector_res, float, 32, 2) = > + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem2); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, ""); > + VECT_VAR (vector_res, float, 32, 2) = > + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem2); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, ""); > + > + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; > + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; > + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); > + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); > + VECT_VAR (vector_res, float, 32, 2) = > + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem3); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, ""); > + VECT_VAR (vector_res, float, 32, 2) = > + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem3); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, ""); > + > +#undef TEST_MSG > +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)" > + clean_results (); > + > + DECL_VARIABLE(vsrc_1, float, 32, 4); > + DECL_VARIABLE(vsrc_2, float, 32, 4); > + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; > + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; > + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); > + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); > + DECL_VARIABLE (vector_res, float, 32, 4) = > + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem0); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, ""); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem0); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, ""); > + > + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; > + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; > + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); > + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem1); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, ""); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem1); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, ""); > + > + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; > + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; > + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); > + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem2); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, ""); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem2); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, ""); > + > + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; > + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; > + VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4); > + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem3); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, ""); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem3); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, ""); > + > +#undef TEST_MSG > +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" > + clean_results (); > + > + DECL_VARIABLE(vsrc_1, float, 64, 2); > + DECL_VARIABLE(vsrc_2, float, 64, 2); > + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; > + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; > + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); > + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); > + DECL_VARIABLE (vector_res, float, 64, 2) = > + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem0); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, ""); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem0); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, ""); > + > + VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; > + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; > + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); > + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem1); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, ""); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem1); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, ""); > + > + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; > + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; > + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); > + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem2); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, ""); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem2); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, ""); > + > + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; > + VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7}; > + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); > + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem3); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, ""); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem3); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, ""); > + > +#undef TEST_MSG > +#define TEST_MSG "VFMS_VFMA_N (FP64)" > + clean_results (); > + > + DECL_VARIABLE(vsrc_1, float, 64, 1); > + DECL_VARIABLE(vsrc_2, float, 64, 1); > + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; > + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; > + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); > + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); > + DECL_VARIABLE (vector_res, float, 64, 1) = > + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem0); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, ""); > + VECT_VAR (vector_res, float, 64, 1) = > + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem0); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, ""); > + > + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; > + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; > + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); > + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); > + VECT_VAR (vector_res, float, 64, 1) = > + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem1); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, ""); > + VECT_VAR (vector_res, float, 64, 1) = > + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem1); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, ""); > + > + VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4}; > + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; > + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); > + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); > + VECT_VAR (vector_res, float, 64, 1) = > + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem2); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, ""); > + VECT_VAR (vector_res, float, 64, 1) = > + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem2); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, ""); > + > + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; > + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; > + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); > + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); > + VECT_VAR (vector_res, float, 64, 1) = > + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem3); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, ""); > + VECT_VAR (vector_res, float, 64, 1) = > + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem3); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, ""); > +} > +#endif > + > +int > +main (void) > +{ > +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) > + exec_vfma_vfms_n (); > +#endif > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c > index 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 100644 > --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c > @@ -110,6 +110,6 @@ main (int argc, char **argv) > /* vfmaq_lane_f64. > vfma_laneq_f64. > vfmaq_laneq_f64. */ > -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ > +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c > index 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 100644 > --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c > @@ -111,6 +111,6 @@ main (int argc, char **argv) > /* vfmsq_lane_f64. > vfms_laneq_f64. > vfmsq_laneq_f64. */ > -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ > +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ > > >
On 17/05/16 13:20, James Greenhalgh wrote: > On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote: >> The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are >> missing in current gcc arm_neon.h. >> >> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can >> also comes from "(fma (vec_dup(scalar" where the scalar value is already >> sitting in vector register then duplicated to other lanes, and there is >> no lane size change. >> >> This patch implement this and can generate better code under some >> context. For example: >> >> cat test.c >> === >> typedef __Float32x2_t float32x2_t; >> typedef float float32_t; >> >> float32x2_t >> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >> { >> return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, >> __c}, __a); >> } >> >> before (-O2) >> === >> vfma_n_f32: >> dup v2.2s, v2.s[0] >> fmla v0.2s, v1.2s, v2.2s >> ret >> after >> === >> vfma_n_f32: >> fmla v0.2s, v1.2s, v2.s[0] >> ret >> >> OK for trunk? >> >> 2016-05-16 Jiong Wang <jiong.wang@arm.com> > This ChangeLog entry is not correctly formatted. There should be two > spaces between your name and your email, and each line should start with > a tab. > >> gcc/ >> * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename >> to *aarch64_fma4_elt_from_dup<mode>. >> (*aarch64_fnma4_elt_to_128df): Rename to >> *aarch64_fnma4_elt_from_dup<mode>. >> * config/aarch64/arm_neon.h (vfma_n_f64): New. >> (vfms_n_f32): Likewise. >> (vfms_n_f64): Likewise. >> (vfmsq_n_f32): Likewise. >> (vfmsq_n_f64): Likewise. >> >> gcc/testsuite/ >> * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use >> standard syntax. >> * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise. > The paths of these two entries are incorrect. Remove the gcc/testsuite > from the front. I don't understand what you mean by "Use standard syntax.", > please fix this to describe what you are actually changing. > >> * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry >> for float64x1. >> * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New. > These two changes need approval from an ARM maintainer as they are in > common files. > > From an AArch64 perspective, this patch is OK with a fixed ChangeLog. Please > wait for an ARM OK for the test changes. Considering that the tests' functionality is guarded on #if defined(__aarch64__) it's a noop on arm and so is ok from that perspective (we have precedence for tests guarded in such a way in advsimd-intrinsics.exp) The arm-neon-ref.h additions are ok too. Thanks, Kyrill > Thanks, > James > >> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md >> index bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 100644 >> --- a/gcc/config/aarch64/aarch64-simd.md >> +++ b/gcc/config/aarch64/aarch64-simd.md >> @@ -1579,16 +1579,16 @@ >> [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] >> ) >> >> -(define_insn "*aarch64_fma4_elt_to_128df" >> - [(set (match_operand:V2DF 0 "register_operand" "=w") >> - (fma:V2DF >> - (vec_duplicate:V2DF >> - (match_operand:DF 1 "register_operand" "w")) >> - (match_operand:V2DF 2 "register_operand" "w") >> - (match_operand:V2DF 3 "register_operand" "0")))] >> +(define_insn "*aarch64_fma4_elt_from_dup<mode>" >> + [(set (match_operand:VMUL 0 "register_operand" "=w") >> + (fma:VMUL >> + (vec_duplicate:VMUL >> + (match_operand:<VEL> 1 "register_operand" "w")) >> + (match_operand:VMUL 2 "register_operand" "w") >> + (match_operand:VMUL 3 "register_operand" "0")))] >> "TARGET_SIMD" >> - "fmla\\t%0.2d, %2.2d, %1.2d[0]" >> - [(set_attr "type" "neon_fp_mla_d_scalar_q")] >> + "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" >> + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] >> ) >> >> (define_insn "*aarch64_fma4_elt_to_64v2df" >> @@ -1656,17 +1656,17 @@ >> [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] >> ) >> >> -(define_insn "*aarch64_fnma4_elt_to_128df" >> - [(set (match_operand:V2DF 0 "register_operand" "=w") >> - (fma:V2DF >> - (neg:V2DF >> - (match_operand:V2DF 2 "register_operand" "w")) >> - (vec_duplicate:V2DF >> - (match_operand:DF 1 "register_operand" "w")) >> - (match_operand:V2DF 3 "register_operand" "0")))] >> - "TARGET_SIMD" >> - "fmls\\t%0.2d, %2.2d, %1.2d[0]" >> - [(set_attr "type" "neon_fp_mla_d_scalar_q")] >> +(define_insn "*aarch64_fnma4_elt_from_dup<mode>" >> + [(set (match_operand:VMUL 0 "register_operand" "=w") >> + (fma:VMUL >> + (neg:VMUL >> + (match_operand:VMUL 2 "register_operand" "w")) >> + (vec_duplicate:VMUL >> + (match_operand:<VEL> 1 "register_operand" "w")) >> + (match_operand:VMUL 3 "register_operand" "0")))] >> + "TARGET_SIMD" >> + "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" >> + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] >> ) >> >> (define_insn "*aarch64_fnma4_elt_to_64v2df" >> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h >> index 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c 100644 >> --- a/gcc/config/aarch64/arm_neon.h >> +++ b/gcc/config/aarch64/arm_neon.h >> @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >> return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); >> } >> >> +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) >> +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) >> +{ >> + return (float64x1_t) {__b[0] * __c + __a[0]}; >> +} >> + >> __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) >> vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) >> { >> @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) >> return __builtin_aarch64_fmav2df (-__b, __c, __a); >> } >> >> +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) >> +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >> +{ >> + return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a); >> +} >> + >> +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) >> +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) >> +{ >> + return (float64x1_t) {-__b[0] * __c + __a[0]}; >> +} >> + >> +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) >> +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) >> +{ >> + return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a); >> +} >> + >> +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) >> +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) >> +{ >> + return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); >> +} >> >> /* vfms_lane */ >> >> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >> index 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 100644 >> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >> @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4); >> #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) >> static ARRAY(result, float, 16, 4); >> #endif >> +static ARRAY(result, float, 64, 1); >> static ARRAY(result, float, 32, 2); >> static ARRAY(result, int, 8, 16); >> static ARRAY(result, int, 16, 8); >> @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8); >> extern ARRAY(expected, poly, 16, 4); >> extern ARRAY(expected, hfloat, 16, 4); >> extern ARRAY(expected, hfloat, 32, 2); >> +extern ARRAY(expected, hfloat, 64, 1); >> extern ARRAY(expected, int, 8, 16); >> extern ARRAY(expected, int, 16, 8); >> extern ARRAY(expected, int, 32, 4); >> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >> new file mode 100644 >> index 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >> @@ -0,0 +1,490 @@ >> +#include <arm_neon.h> >> +#include "arm-neon-ref.h" >> +#include "compute-ref-data.h" >> + >> +#define A0 123.4f >> +#define A1 -3.8f >> +#define A2 -29.4f >> +#define A3 (__builtin_inff ()) >> +#define A4 0.0f >> +#define A5 24.0f >> +#define A6 124.0f >> +#define A7 1024.0f >> + >> +#define B0 -5.8f >> +#define B1 -0.0f >> +#define B2 -10.8f >> +#define B3 10.0f >> +#define B4 23.4f >> +#define B5 -1234.8f >> +#define B6 8.9f >> +#define B7 4.0f >> + >> +#define E0 9.8f >> +#define E1 -1024.0f >> +#define E2 (-__builtin_inff ()) >> +#define E3 479.0f >> +float32_t elem0 = E0; >> +float32_t elem1 = E1; >> +float32_t elem2 = E2; >> +float32_t elem3 = E3; >> + >> +#define DA0 1231234.4 >> +#define DA1 -3.8 >> +#define DA2 -2980.4 >> +#define DA3 -5.8 >> +#define DA4 0.01123 >> +#define DA5 24.0 >> +#define DA6 124.12345 >> +#define DA7 1024.0 >> + >> +#define DB0 -5.8 >> +#define DB1 (__builtin_inf ()) >> +#define DB2 -105.8 >> +#define DB3 10.0 >> +#define DB4 (-__builtin_inf ()) >> +#define DB5 -1234.8 >> +#define DB6 848.9 >> +#define DB7 44444.0 >> + >> +#define DE0 9.8 >> +#define DE1 -1024.0 >> +#define DE2 105.8 >> +#define DE3 479.0 >> +float64_t delem0 = DE0; >> +float64_t delem1 = DE1; >> +float64_t delem2 = DE2; >> +float64_t delem3 = DE3; >> + >> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) >> + >> +/* Expected results for vfms_n. */ >> + >> +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0}; >> +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1}; >> +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2}; >> +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3}; >> +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0}; >> +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1}; >> +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2}; >> +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3}; >> + >> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = >> + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); >> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = >> + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); >> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = >> + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); >> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = >> + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2); >> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = >> + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); >> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = >> + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); >> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = >> + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); >> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = >> + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); >> + >> + >> +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0, >> + A2 + -B2 * E0, A3 + -B3 * E0}; >> +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1, >> + A6 + -B6 * E1, A7 + -B7 * E1}; >> +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2, >> + A4 + -B4 * E2, A6 + -B6 * E2}; >> +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3, >> + A5 + -B5 * E3, A7 + -B7 * E3}; >> +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0, >> + A2 + B2 * E0, A3 + B3 * E0}; >> +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1, >> + A6 + B6 * E1, A7 + B7 * E1}; >> +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2, >> + A4 + B4 * E2, A6 + B6 * E2}; >> +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3, >> + A5 + B5 * E3, A7 + B7 * E3}; >> + >> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = >> + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); >> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = >> + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); >> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = >> + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); >> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = >> + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); >> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = >> + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); >> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = >> + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); >> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = >> + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); >> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = >> + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); >> + >> +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, >> + DA1 + -DB1 * DE0}; >> +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1, >> + DA3 + -DB3 * DE1}; >> +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, >> + DA5 + -DB5 * DE2}; >> +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, >> + DA7 + -DB7 * DE3}; >> +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, >> + DA1 + DB1 * DE0}; >> +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, >> + DA3 + DB3 * DE1}; >> +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, >> + DA5 + DB5 * DE2}; >> +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, >> + DA7 + DB7 * DE3}; >> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = >> + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); >> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = >> + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); >> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = >> + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); >> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = >> + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); >> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = >> + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); >> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = >> + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); >> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = >> + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2); >> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = >> + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); >> + >> +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; >> +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; >> +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; >> +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; >> +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; >> +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; >> +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; >> +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; >> + >> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = >> + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); >> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = >> + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); >> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = >> + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1); >> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = >> + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); >> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = >> + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); >> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = >> + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); >> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = >> + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); >> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = >> + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); >> + >> +void exec_vfma_vfms_n (void) >> +{ >> +#undef TEST_MSG >> +#define TEST_MSG "VFMS_VFMA_N (FP32)" >> + clean_results (); >> + >> + DECL_VARIABLE(vsrc_1, float, 32, 2); >> + DECL_VARIABLE(vsrc_2, float, 32, 2); >> + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; >> + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; >> + VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); >> + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); >> + DECL_VARIABLE (vector_res, float, 32, 2) = >> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >> + VECT_VAR (vsrc_2, float, 32, 2), elem0); >> + vst1_f32 (VECT_VAR (result, float, 32, 2), >> + VECT_VAR (vector_res, float, 32, 2)); >> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, ""); >> + VECT_VAR (vector_res, float, 32, 2) = >> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >> + VECT_VAR (vsrc_2, float, 32, 2), elem0); >> + vst1_f32 (VECT_VAR (result, float, 32, 2), >> + VECT_VAR (vector_res, float, 32, 2)); >> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, ""); >> + >> + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; >> + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; >> + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); >> + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); >> + VECT_VAR (vector_res, float, 32, 2) = >> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >> + VECT_VAR (vsrc_2, float, 32, 2), elem1); >> + vst1_f32 (VECT_VAR (result, float, 32, 2), >> + VECT_VAR (vector_res, float, 32, 2)); >> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, ""); >> + VECT_VAR (vector_res, float, 32, 2) = >> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >> + VECT_VAR (vsrc_2, float, 32, 2), elem1); >> + vst1_f32 (VECT_VAR (result, float, 32, 2), >> + VECT_VAR (vector_res, float, 32, 2)); >> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, ""); >> + >> + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; >> + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; >> + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); >> + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); >> + VECT_VAR (vector_res, float, 32, 2) = >> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >> + VECT_VAR (vsrc_2, float, 32, 2), elem2); >> + vst1_f32 (VECT_VAR (result, float, 32, 2), >> + VECT_VAR (vector_res, float, 32, 2)); >> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, ""); >> + VECT_VAR (vector_res, float, 32, 2) = >> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >> + VECT_VAR (vsrc_2, float, 32, 2), elem2); >> + vst1_f32 (VECT_VAR (result, float, 32, 2), >> + VECT_VAR (vector_res, float, 32, 2)); >> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, ""); >> + >> + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; >> + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; >> + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); >> + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); >> + VECT_VAR (vector_res, float, 32, 2) = >> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >> + VECT_VAR (vsrc_2, float, 32, 2), elem3); >> + vst1_f32 (VECT_VAR (result, float, 32, 2), >> + VECT_VAR (vector_res, float, 32, 2)); >> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, ""); >> + VECT_VAR (vector_res, float, 32, 2) = >> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >> + VECT_VAR (vsrc_2, float, 32, 2), elem3); >> + vst1_f32 (VECT_VAR (result, float, 32, 2), >> + VECT_VAR (vector_res, float, 32, 2)); >> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, ""); >> + >> +#undef TEST_MSG >> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)" >> + clean_results (); >> + >> + DECL_VARIABLE(vsrc_1, float, 32, 4); >> + DECL_VARIABLE(vsrc_2, float, 32, 4); >> + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; >> + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; >> + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); >> + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); >> + DECL_VARIABLE (vector_res, float, 32, 4) = >> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >> + VECT_VAR (vsrc_2, float, 32, 4), elem0); >> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >> + VECT_VAR (vector_res, float, 32, 4)); >> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, ""); >> + VECT_VAR (vector_res, float, 32, 4) = >> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >> + VECT_VAR (vsrc_2, float, 32, 4), elem0); >> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >> + VECT_VAR (vector_res, float, 32, 4)); >> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, ""); >> + >> + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; >> + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; >> + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); >> + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); >> + VECT_VAR (vector_res, float, 32, 4) = >> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >> + VECT_VAR (vsrc_2, float, 32, 4), elem1); >> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >> + VECT_VAR (vector_res, float, 32, 4)); >> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, ""); >> + VECT_VAR (vector_res, float, 32, 4) = >> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >> + VECT_VAR (vsrc_2, float, 32, 4), elem1); >> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >> + VECT_VAR (vector_res, float, 32, 4)); >> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, ""); >> + >> + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; >> + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; >> + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); >> + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); >> + VECT_VAR (vector_res, float, 32, 4) = >> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >> + VECT_VAR (vsrc_2, float, 32, 4), elem2); >> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >> + VECT_VAR (vector_res, float, 32, 4)); >> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, ""); >> + VECT_VAR (vector_res, float, 32, 4) = >> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >> + VECT_VAR (vsrc_2, float, 32, 4), elem2); >> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >> + VECT_VAR (vector_res, float, 32, 4)); >> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, ""); >> + >> + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; >> + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; >> + VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4); >> + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); >> + VECT_VAR (vector_res, float, 32, 4) = >> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >> + VECT_VAR (vsrc_2, float, 32, 4), elem3); >> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >> + VECT_VAR (vector_res, float, 32, 4)); >> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, ""); >> + VECT_VAR (vector_res, float, 32, 4) = >> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >> + VECT_VAR (vsrc_2, float, 32, 4), elem3); >> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >> + VECT_VAR (vector_res, float, 32, 4)); >> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, ""); >> + >> +#undef TEST_MSG >> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" >> + clean_results (); >> + >> + DECL_VARIABLE(vsrc_1, float, 64, 2); >> + DECL_VARIABLE(vsrc_2, float, 64, 2); >> + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; >> + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; >> + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); >> + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); >> + DECL_VARIABLE (vector_res, float, 64, 2) = >> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >> + VECT_VAR (vsrc_2, float, 64, 2), delem0); >> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >> + VECT_VAR (vector_res, float, 64, 2)); >> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, ""); >> + VECT_VAR (vector_res, float, 64, 2) = >> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >> + VECT_VAR (vsrc_2, float, 64, 2), delem0); >> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >> + VECT_VAR (vector_res, float, 64, 2)); >> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, ""); >> + >> + VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; >> + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; >> + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); >> + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); >> + VECT_VAR (vector_res, float, 64, 2) = >> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >> + VECT_VAR (vsrc_2, float, 64, 2), delem1); >> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >> + VECT_VAR (vector_res, float, 64, 2)); >> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, ""); >> + VECT_VAR (vector_res, float, 64, 2) = >> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >> + VECT_VAR (vsrc_2, float, 64, 2), delem1); >> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >> + VECT_VAR (vector_res, float, 64, 2)); >> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, ""); >> + >> + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; >> + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; >> + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); >> + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); >> + VECT_VAR (vector_res, float, 64, 2) = >> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >> + VECT_VAR (vsrc_2, float, 64, 2), delem2); >> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >> + VECT_VAR (vector_res, float, 64, 2)); >> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, ""); >> + VECT_VAR (vector_res, float, 64, 2) = >> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >> + VECT_VAR (vsrc_2, float, 64, 2), delem2); >> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >> + VECT_VAR (vector_res, float, 64, 2)); >> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, ""); >> + >> + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; >> + VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7}; >> + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); >> + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); >> + VECT_VAR (vector_res, float, 64, 2) = >> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >> + VECT_VAR (vsrc_2, float, 64, 2), delem3); >> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >> + VECT_VAR (vector_res, float, 64, 2)); >> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, ""); >> + VECT_VAR (vector_res, float, 64, 2) = >> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >> + VECT_VAR (vsrc_2, float, 64, 2), delem3); >> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >> + VECT_VAR (vector_res, float, 64, 2)); >> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, ""); >> + >> +#undef TEST_MSG >> +#define TEST_MSG "VFMS_VFMA_N (FP64)" >> + clean_results (); >> + >> + DECL_VARIABLE(vsrc_1, float, 64, 1); >> + DECL_VARIABLE(vsrc_2, float, 64, 1); >> + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; >> + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; >> + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); >> + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); >> + DECL_VARIABLE (vector_res, float, 64, 1) = >> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >> + VECT_VAR (vsrc_2, float, 64, 1), delem0); >> + vst1_f64 (VECT_VAR (result, float, 64, 1), >> + VECT_VAR (vector_res, float, 64, 1)); >> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, ""); >> + VECT_VAR (vector_res, float, 64, 1) = >> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >> + VECT_VAR (vsrc_2, float, 64, 1), delem0); >> + vst1_f64 (VECT_VAR (result, float, 64, 1), >> + VECT_VAR (vector_res, float, 64, 1)); >> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, ""); >> + >> + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; >> + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; >> + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); >> + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); >> + VECT_VAR (vector_res, float, 64, 1) = >> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >> + VECT_VAR (vsrc_2, float, 64, 1), delem1); >> + vst1_f64 (VECT_VAR (result, float, 64, 1), >> + VECT_VAR (vector_res, float, 64, 1)); >> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, ""); >> + VECT_VAR (vector_res, float, 64, 1) = >> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >> + VECT_VAR (vsrc_2, float, 64, 1), delem1); >> + vst1_f64 (VECT_VAR (result, float, 64, 1), >> + VECT_VAR (vector_res, float, 64, 1)); >> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, ""); >> + >> + VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4}; >> + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; >> + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); >> + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); >> + VECT_VAR (vector_res, float, 64, 1) = >> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >> + VECT_VAR (vsrc_2, float, 64, 1), delem2); >> + vst1_f64 (VECT_VAR (result, float, 64, 1), >> + VECT_VAR (vector_res, float, 64, 1)); >> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, ""); >> + VECT_VAR (vector_res, float, 64, 1) = >> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >> + VECT_VAR (vsrc_2, float, 64, 1), delem2); >> + vst1_f64 (VECT_VAR (result, float, 64, 1), >> + VECT_VAR (vector_res, float, 64, 1)); >> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, ""); >> + >> + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; >> + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; >> + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); >> + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); >> + VECT_VAR (vector_res, float, 64, 1) = >> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >> + VECT_VAR (vsrc_2, float, 64, 1), delem3); >> + vst1_f64 (VECT_VAR (result, float, 64, 1), >> + VECT_VAR (vector_res, float, 64, 1)); >> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, ""); >> + VECT_VAR (vector_res, float, 64, 1) = >> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >> + VECT_VAR (vsrc_2, float, 64, 1), delem3); >> + vst1_f64 (VECT_VAR (result, float, 64, 1), >> + VECT_VAR (vector_res, float, 64, 1)); >> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, ""); >> +} >> +#endif >> + >> +int >> +main (void) >> +{ >> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) >> + exec_vfma_vfms_n (); >> +#endif >> + return 0; >> +} >> diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >> index 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 100644 >> --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >> +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >> @@ -110,6 +110,6 @@ main (int argc, char **argv) >> /* vfmaq_lane_f64. >> vfma_laneq_f64. >> vfmaq_laneq_f64. */ >> -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ >> +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ >> >> >> diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >> index 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 100644 >> --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >> +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >> @@ -111,6 +111,6 @@ main (int argc, char **argv) >> /* vfmsq_lane_f64. >> vfms_laneq_f64. >> vfmsq_laneq_f64. */ >> -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ >> +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ >> >> >>
On 17/05/16 13:40, Kyrill Tkachov wrote: > > On 17/05/16 13:20, James Greenhalgh wrote: >> On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote: >>> The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are >>> missing in current gcc arm_neon.h. >>> >>> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can >>> also comes from "(fma (vec_dup(scalar" where the scalar value is already >>> sitting in vector register then duplicated to other lanes, and there is >>> no lane size change. >>> >>> This patch implement this and can generate better code under some >>> context. For example: >>> >>> cat test.c >>> === >>> typedef __Float32x2_t float32x2_t; >>> typedef float float32_t; >>> >>> float32x2_t >>> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >>> { >>> return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, >>> __c}, __a); >>> } >>> >>> before (-O2) >>> === >>> vfma_n_f32: >>> dup v2.2s, v2.s[0] >>> fmla v0.2s, v1.2s, v2.2s >>> ret >>> after >>> === >>> vfma_n_f32: >>> fmla v0.2s, v1.2s, v2.s[0] >>> ret >>> >>> OK for trunk? >>> >>> 2016-05-16 Jiong Wang <jiong.wang@arm.com> >> This ChangeLog entry is not correctly formatted. There should be two >> spaces between your name and your email, and each line should start with >> a tab. >> >>> gcc/ >>> * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename >>> to *aarch64_fma4_elt_from_dup<mode>. >>> (*aarch64_fnma4_elt_to_128df): Rename to >>> *aarch64_fnma4_elt_from_dup<mode>. >>> * config/aarch64/arm_neon.h (vfma_n_f64): New. >>> (vfms_n_f32): Likewise. >>> (vfms_n_f64): Likewise. >>> (vfmsq_n_f32): Likewise. >>> (vfmsq_n_f64): Likewise. >>> >>> gcc/testsuite/ >>> * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use >>> standard syntax. >>> * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise. >> The paths of these two entries are incorrect. Remove the gcc/testsuite >> from the front. I don't understand what you mean by "Use standard syntax.", >> please fix this to describe what you are actually changing. >> >>> * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry >>> for float64x1. >>> * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New. >> These two changes need approval from an ARM maintainer as they are in >> common files. >> >> From an AArch64 perspective, this patch is OK with a fixed ChangeLog. Please >> wait for an ARM OK for the test changes. > > Considering that the tests' functionality is guarded on #if defined(__aarch64__) > it's a noop on arm and so is ok from that perspective (we have precedence for > tests guarded in such a way in advsimd-intrinsics.exp) > Of course I meant precedent rather than precedence :/ Kyrill > The arm-neon-ref.h additions are ok too. > > Thanks, > Kyrill > > >> Thanks, >> James >> >>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md >>> index bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 100644 >>> --- a/gcc/config/aarch64/aarch64-simd.md >>> +++ b/gcc/config/aarch64/aarch64-simd.md >>> @@ -1579,16 +1579,16 @@ >>> [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] >>> ) >>> -(define_insn "*aarch64_fma4_elt_to_128df" >>> - [(set (match_operand:V2DF 0 "register_operand" "=w") >>> - (fma:V2DF >>> - (vec_duplicate:V2DF >>> - (match_operand:DF 1 "register_operand" "w")) >>> - (match_operand:V2DF 2 "register_operand" "w") >>> - (match_operand:V2DF 3 "register_operand" "0")))] >>> +(define_insn "*aarch64_fma4_elt_from_dup<mode>" >>> + [(set (match_operand:VMUL 0 "register_operand" "=w") >>> + (fma:VMUL >>> + (vec_duplicate:VMUL >>> + (match_operand:<VEL> 1 "register_operand" "w")) >>> + (match_operand:VMUL 2 "register_operand" "w") >>> + (match_operand:VMUL 3 "register_operand" "0")))] >>> "TARGET_SIMD" >>> - "fmla\\t%0.2d, %2.2d, %1.2d[0]" >>> - [(set_attr "type" "neon_fp_mla_d_scalar_q")] >>> + "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" >>> + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] >>> ) >>> (define_insn "*aarch64_fma4_elt_to_64v2df" >>> @@ -1656,17 +1656,17 @@ >>> [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] >>> ) >>> -(define_insn "*aarch64_fnma4_elt_to_128df" >>> - [(set (match_operand:V2DF 0 "register_operand" "=w") >>> - (fma:V2DF >>> - (neg:V2DF >>> - (match_operand:V2DF 2 "register_operand" "w")) >>> - (vec_duplicate:V2DF >>> - (match_operand:DF 1 "register_operand" "w")) >>> - (match_operand:V2DF 3 "register_operand" "0")))] >>> - "TARGET_SIMD" >>> - "fmls\\t%0.2d, %2.2d, %1.2d[0]" >>> - [(set_attr "type" "neon_fp_mla_d_scalar_q")] >>> +(define_insn "*aarch64_fnma4_elt_from_dup<mode>" >>> + [(set (match_operand:VMUL 0 "register_operand" "=w") >>> + (fma:VMUL >>> + (neg:VMUL >>> + (match_operand:VMUL 2 "register_operand" "w")) >>> + (vec_duplicate:VMUL >>> + (match_operand:<VEL> 1 "register_operand" "w")) >>> + (match_operand:VMUL 3 "register_operand" "0")))] >>> + "TARGET_SIMD" >>> + "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" >>> + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] >>> ) >>> (define_insn "*aarch64_fnma4_elt_to_64v2df" >>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h >>> index 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c 100644 >>> --- a/gcc/config/aarch64/arm_neon.h >>> +++ b/gcc/config/aarch64/arm_neon.h >>> @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >>> return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); >>> } >>> +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) >>> +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) >>> +{ >>> + return (float64x1_t) {__b[0] * __c + __a[0]}; >>> +} >>> + >>> __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) >>> vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) >>> { >>> @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) >>> return __builtin_aarch64_fmav2df (-__b, __c, __a); >>> } >>> +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) >>> +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >>> +{ >>> + return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a); >>> +} >>> + >>> +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) >>> +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) >>> +{ >>> + return (float64x1_t) {-__b[0] * __c + __a[0]}; >>> +} >>> + >>> +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) >>> +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) >>> +{ >>> + return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a); >>> +} >>> + >>> +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) >>> +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) >>> +{ >>> + return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); >>> +} >>> /* vfms_lane */ >>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>> index 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 100644 >>> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>> @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4); >>> #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) >>> static ARRAY(result, float, 16, 4); >>> #endif >>> +static ARRAY(result, float, 64, 1); >>> static ARRAY(result, float, 32, 2); >>> static ARRAY(result, int, 8, 16); >>> static ARRAY(result, int, 16, 8); >>> @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8); >>> extern ARRAY(expected, poly, 16, 4); >>> extern ARRAY(expected, hfloat, 16, 4); >>> extern ARRAY(expected, hfloat, 32, 2); >>> +extern ARRAY(expected, hfloat, 64, 1); >>> extern ARRAY(expected, int, 8, 16); >>> extern ARRAY(expected, int, 16, 8); >>> extern ARRAY(expected, int, 32, 4); >>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >>> new file mode 100644 >>> index 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >>> @@ -0,0 +1,490 @@ >>> +#include <arm_neon.h> >>> +#include "arm-neon-ref.h" >>> +#include "compute-ref-data.h" >>> + >>> +#define A0 123.4f >>> +#define A1 -3.8f >>> +#define A2 -29.4f >>> +#define A3 (__builtin_inff ()) >>> +#define A4 0.0f >>> +#define A5 24.0f >>> +#define A6 124.0f >>> +#define A7 1024.0f >>> + >>> +#define B0 -5.8f >>> +#define B1 -0.0f >>> +#define B2 -10.8f >>> +#define B3 10.0f >>> +#define B4 23.4f >>> +#define B5 -1234.8f >>> +#define B6 8.9f >>> +#define B7 4.0f >>> + >>> +#define E0 9.8f >>> +#define E1 -1024.0f >>> +#define E2 (-__builtin_inff ()) >>> +#define E3 479.0f >>> +float32_t elem0 = E0; >>> +float32_t elem1 = E1; >>> +float32_t elem2 = E2; >>> +float32_t elem3 = E3; >>> + >>> +#define DA0 1231234.4 >>> +#define DA1 -3.8 >>> +#define DA2 -2980.4 >>> +#define DA3 -5.8 >>> +#define DA4 0.01123 >>> +#define DA5 24.0 >>> +#define DA6 124.12345 >>> +#define DA7 1024.0 >>> + >>> +#define DB0 -5.8 >>> +#define DB1 (__builtin_inf ()) >>> +#define DB2 -105.8 >>> +#define DB3 10.0 >>> +#define DB4 (-__builtin_inf ()) >>> +#define DB5 -1234.8 >>> +#define DB6 848.9 >>> +#define DB7 44444.0 >>> + >>> +#define DE0 9.8 >>> +#define DE1 -1024.0 >>> +#define DE2 105.8 >>> +#define DE3 479.0 >>> +float64_t delem0 = DE0; >>> +float64_t delem1 = DE1; >>> +float64_t delem2 = DE2; >>> +float64_t delem3 = DE3; >>> + >>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) >>> + >>> +/* Expected results for vfms_n. */ >>> + >>> +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0}; >>> +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1}; >>> +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2}; >>> +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3}; >>> +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0}; >>> +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1}; >>> +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2}; >>> +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3}; >>> + >>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = >>> + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); >>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = >>> + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); >>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = >>> + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); >>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = >>> + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2); >>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = >>> + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); >>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = >>> + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); >>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = >>> + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); >>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = >>> + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); >>> + >>> + >>> +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0, >>> + A2 + -B2 * E0, A3 + -B3 * E0}; >>> +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1, >>> + A6 + -B6 * E1, A7 + -B7 * E1}; >>> +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2, >>> + A4 + -B4 * E2, A6 + -B6 * E2}; >>> +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3, >>> + A5 + -B5 * E3, A7 + -B7 * E3}; >>> +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0, >>> + A2 + B2 * E0, A3 + B3 * E0}; >>> +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1, >>> + A6 + B6 * E1, A7 + B7 * E1}; >>> +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2, >>> + A4 + B4 * E2, A6 + B6 * E2}; >>> +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3, >>> + A5 + B5 * E3, A7 + B7 * E3}; >>> + >>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = >>> + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); >>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = >>> + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); >>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = >>> + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); >>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = >>> + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); >>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = >>> + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); >>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = >>> + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); >>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = >>> + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); >>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = >>> + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); >>> + >>> +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, >>> + DA1 + -DB1 * DE0}; >>> +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1, >>> + DA3 + -DB3 * DE1}; >>> +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, >>> + DA5 + -DB5 * DE2}; >>> +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, >>> + DA7 + -DB7 * DE3}; >>> +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, >>> + DA1 + DB1 * DE0}; >>> +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, >>> + DA3 + DB3 * DE1}; >>> +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, >>> + DA5 + DB5 * DE2}; >>> +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, >>> + DA7 + DB7 * DE3}; >>> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = >>> + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); >>> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = >>> + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); >>> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = >>> + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); >>> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = >>> + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); >>> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = >>> + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); >>> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = >>> + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); >>> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = >>> + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2); >>> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = >>> + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); >>> + >>> +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; >>> +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; >>> +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; >>> +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; >>> +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; >>> +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; >>> +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; >>> +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; >>> + >>> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = >>> + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); >>> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = >>> + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); >>> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = >>> + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1); >>> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = >>> + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); >>> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = >>> + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); >>> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = >>> + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); >>> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = >>> + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); >>> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = >>> + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); >>> + >>> +void exec_vfma_vfms_n (void) >>> +{ >>> +#undef TEST_MSG >>> +#define TEST_MSG "VFMS_VFMA_N (FP32)" >>> + clean_results (); >>> + >>> + DECL_VARIABLE(vsrc_1, float, 32, 2); >>> + DECL_VARIABLE(vsrc_2, float, 32, 2); >>> + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; >>> + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; >>> + VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); >>> + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); >>> + DECL_VARIABLE (vector_res, float, 32, 2) = >>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>> + VECT_VAR (vsrc_2, float, 32, 2), elem0); >>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>> + VECT_VAR (vector_res, float, 32, 2)); >>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, ""); >>> + VECT_VAR (vector_res, float, 32, 2) = >>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>> + VECT_VAR (vsrc_2, float, 32, 2), elem0); >>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>> + VECT_VAR (vector_res, float, 32, 2)); >>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; >>> + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; >>> + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); >>> + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); >>> + VECT_VAR (vector_res, float, 32, 2) = >>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>> + VECT_VAR (vsrc_2, float, 32, 2), elem1); >>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>> + VECT_VAR (vector_res, float, 32, 2)); >>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, ""); >>> + VECT_VAR (vector_res, float, 32, 2) = >>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>> + VECT_VAR (vsrc_2, float, 32, 2), elem1); >>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>> + VECT_VAR (vector_res, float, 32, 2)); >>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; >>> + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; >>> + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); >>> + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); >>> + VECT_VAR (vector_res, float, 32, 2) = >>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>> + VECT_VAR (vsrc_2, float, 32, 2), elem2); >>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>> + VECT_VAR (vector_res, float, 32, 2)); >>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, ""); >>> + VECT_VAR (vector_res, float, 32, 2) = >>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>> + VECT_VAR (vsrc_2, float, 32, 2), elem2); >>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>> + VECT_VAR (vector_res, float, 32, 2)); >>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; >>> + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; >>> + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); >>> + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); >>> + VECT_VAR (vector_res, float, 32, 2) = >>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>> + VECT_VAR (vsrc_2, float, 32, 2), elem3); >>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>> + VECT_VAR (vector_res, float, 32, 2)); >>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, ""); >>> + VECT_VAR (vector_res, float, 32, 2) = >>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>> + VECT_VAR (vsrc_2, float, 32, 2), elem3); >>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>> + VECT_VAR (vector_res, float, 32, 2)); >>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, ""); >>> + >>> +#undef TEST_MSG >>> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)" >>> + clean_results (); >>> + >>> + DECL_VARIABLE(vsrc_1, float, 32, 4); >>> + DECL_VARIABLE(vsrc_2, float, 32, 4); >>> + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; >>> + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; >>> + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); >>> + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); >>> + DECL_VARIABLE (vector_res, float, 32, 4) = >>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>> + VECT_VAR (vsrc_2, float, 32, 4), elem0); >>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>> + VECT_VAR (vector_res, float, 32, 4)); >>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, ""); >>> + VECT_VAR (vector_res, float, 32, 4) = >>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>> + VECT_VAR (vsrc_2, float, 32, 4), elem0); >>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>> + VECT_VAR (vector_res, float, 32, 4)); >>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; >>> + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; >>> + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); >>> + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); >>> + VECT_VAR (vector_res, float, 32, 4) = >>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>> + VECT_VAR (vsrc_2, float, 32, 4), elem1); >>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>> + VECT_VAR (vector_res, float, 32, 4)); >>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, ""); >>> + VECT_VAR (vector_res, float, 32, 4) = >>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>> + VECT_VAR (vsrc_2, float, 32, 4), elem1); >>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>> + VECT_VAR (vector_res, float, 32, 4)); >>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; >>> + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; >>> + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); >>> + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); >>> + VECT_VAR (vector_res, float, 32, 4) = >>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>> + VECT_VAR (vsrc_2, float, 32, 4), elem2); >>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>> + VECT_VAR (vector_res, float, 32, 4)); >>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, ""); >>> + VECT_VAR (vector_res, float, 32, 4) = >>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>> + VECT_VAR (vsrc_2, float, 32, 4), elem2); >>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>> + VECT_VAR (vector_res, float, 32, 4)); >>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; >>> + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; >>> + VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4); >>> + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); >>> + VECT_VAR (vector_res, float, 32, 4) = >>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>> + VECT_VAR (vsrc_2, float, 32, 4), elem3); >>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>> + VECT_VAR (vector_res, float, 32, 4)); >>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, ""); >>> + VECT_VAR (vector_res, float, 32, 4) = >>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>> + VECT_VAR (vsrc_2, float, 32, 4), elem3); >>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>> + VECT_VAR (vector_res, float, 32, 4)); >>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, ""); >>> + >>> +#undef TEST_MSG >>> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" >>> + clean_results (); >>> + >>> + DECL_VARIABLE(vsrc_1, float, 64, 2); >>> + DECL_VARIABLE(vsrc_2, float, 64, 2); >>> + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; >>> + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; >>> + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); >>> + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); >>> + DECL_VARIABLE (vector_res, float, 64, 2) = >>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>> + VECT_VAR (vsrc_2, float, 64, 2), delem0); >>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>> + VECT_VAR (vector_res, float, 64, 2)); >>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, ""); >>> + VECT_VAR (vector_res, float, 64, 2) = >>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>> + VECT_VAR (vsrc_2, float, 64, 2), delem0); >>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>> + VECT_VAR (vector_res, float, 64, 2)); >>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; >>> + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; >>> + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); >>> + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); >>> + VECT_VAR (vector_res, float, 64, 2) = >>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>> + VECT_VAR (vsrc_2, float, 64, 2), delem1); >>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>> + VECT_VAR (vector_res, float, 64, 2)); >>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, ""); >>> + VECT_VAR (vector_res, float, 64, 2) = >>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>> + VECT_VAR (vsrc_2, float, 64, 2), delem1); >>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>> + VECT_VAR (vector_res, float, 64, 2)); >>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; >>> + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; >>> + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); >>> + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); >>> + VECT_VAR (vector_res, float, 64, 2) = >>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>> + VECT_VAR (vsrc_2, float, 64, 2), delem2); >>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>> + VECT_VAR (vector_res, float, 64, 2)); >>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, ""); >>> + VECT_VAR (vector_res, float, 64, 2) = >>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>> + VECT_VAR (vsrc_2, float, 64, 2), delem2); >>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>> + VECT_VAR (vector_res, float, 64, 2)); >>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; >>> + VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7}; >>> + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); >>> + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); >>> + VECT_VAR (vector_res, float, 64, 2) = >>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>> + VECT_VAR (vsrc_2, float, 64, 2), delem3); >>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>> + VECT_VAR (vector_res, float, 64, 2)); >>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, ""); >>> + VECT_VAR (vector_res, float, 64, 2) = >>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>> + VECT_VAR (vsrc_2, float, 64, 2), delem3); >>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>> + VECT_VAR (vector_res, float, 64, 2)); >>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, ""); >>> + >>> +#undef TEST_MSG >>> +#define TEST_MSG "VFMS_VFMA_N (FP64)" >>> + clean_results (); >>> + >>> + DECL_VARIABLE(vsrc_1, float, 64, 1); >>> + DECL_VARIABLE(vsrc_2, float, 64, 1); >>> + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; >>> + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; >>> + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); >>> + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); >>> + DECL_VARIABLE (vector_res, float, 64, 1) = >>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>> + VECT_VAR (vsrc_2, float, 64, 1), delem0); >>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>> + VECT_VAR (vector_res, float, 64, 1)); >>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, ""); >>> + VECT_VAR (vector_res, float, 64, 1) = >>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>> + VECT_VAR (vsrc_2, float, 64, 1), delem0); >>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>> + VECT_VAR (vector_res, float, 64, 1)); >>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; >>> + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; >>> + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); >>> + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); >>> + VECT_VAR (vector_res, float, 64, 1) = >>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>> + VECT_VAR (vsrc_2, float, 64, 1), delem1); >>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>> + VECT_VAR (vector_res, float, 64, 1)); >>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, ""); >>> + VECT_VAR (vector_res, float, 64, 1) = >>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>> + VECT_VAR (vsrc_2, float, 64, 1), delem1); >>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>> + VECT_VAR (vector_res, float, 64, 1)); >>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4}; >>> + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; >>> + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); >>> + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); >>> + VECT_VAR (vector_res, float, 64, 1) = >>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>> + VECT_VAR (vsrc_2, float, 64, 1), delem2); >>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>> + VECT_VAR (vector_res, float, 64, 1)); >>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, ""); >>> + VECT_VAR (vector_res, float, 64, 1) = >>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>> + VECT_VAR (vsrc_2, float, 64, 1), delem2); >>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>> + VECT_VAR (vector_res, float, 64, 1)); >>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, ""); >>> + >>> + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; >>> + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; >>> + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); >>> + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); >>> + VECT_VAR (vector_res, float, 64, 1) = >>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>> + VECT_VAR (vsrc_2, float, 64, 1), delem3); >>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>> + VECT_VAR (vector_res, float, 64, 1)); >>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, ""); >>> + VECT_VAR (vector_res, float, 64, 1) = >>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>> + VECT_VAR (vsrc_2, float, 64, 1), delem3); >>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>> + VECT_VAR (vector_res, float, 64, 1)); >>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, ""); >>> +} >>> +#endif >>> + >>> +int >>> +main (void) >>> +{ >>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) >>> + exec_vfma_vfms_n (); >>> +#endif >>> + return 0; >>> +} >>> diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>> index 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 100644 >>> --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>> +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>> @@ -110,6 +110,6 @@ main (int argc, char **argv) >>> /* vfmaq_lane_f64. >>> vfma_laneq_f64. >>> vfmaq_laneq_f64. */ >>> -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ >>> +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ >>> diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>> index 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 100644 >>> --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>> +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>> @@ -111,6 +111,6 @@ main (int argc, char **argv) >>> /* vfmsq_lane_f64. >>> vfms_laneq_f64. >>> vfmsq_laneq_f64. */ >>> -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ >>> +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ >>> >
On 17 May 2016 at 14:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote: > > On 17/05/16 13:40, Kyrill Tkachov wrote: >> >> >> On 17/05/16 13:20, James Greenhalgh wrote: >>> >>> On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote: >>>> >>>> The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are >>>> missing in current gcc arm_neon.h. >>>> >>>> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can >>>> also comes from "(fma (vec_dup(scalar" where the scalar value is already >>>> sitting in vector register then duplicated to other lanes, and there is >>>> no lane size change. >>>> >>>> This patch implement this and can generate better code under some >>>> context. For example: >>>> >>>> cat test.c >>>> === >>>> typedef __Float32x2_t float32x2_t; >>>> typedef float float32_t; >>>> >>>> float32x2_t >>>> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >>>> { >>>> return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, >>>> __c}, __a); >>>> } >>>> >>>> before (-O2) >>>> === >>>> vfma_n_f32: >>>> dup v2.2s, v2.s[0] >>>> fmla v0.2s, v1.2s, v2.2s >>>> ret >>>> after >>>> === >>>> vfma_n_f32: >>>> fmla v0.2s, v1.2s, v2.s[0] >>>> ret >>>> >>>> OK for trunk? >>>> >>>> 2016-05-16 Jiong Wang <jiong.wang@arm.com> >>> >>> This ChangeLog entry is not correctly formatted. There should be two >>> spaces between your name and your email, and each line should start with >>> a tab. >>> >>>> gcc/ >>>> * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename >>>> to *aarch64_fma4_elt_from_dup<mode>. >>>> (*aarch64_fnma4_elt_to_128df): Rename to >>>> *aarch64_fnma4_elt_from_dup<mode>. >>>> * config/aarch64/arm_neon.h (vfma_n_f64): New. >>>> (vfms_n_f32): Likewise. >>>> (vfms_n_f64): Likewise. >>>> (vfmsq_n_f32): Likewise. >>>> (vfmsq_n_f64): Likewise. >>>> >>>> gcc/testsuite/ >>>> * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use >>>> standard syntax. >>>> * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise. >>> >>> The paths of these two entries are incorrect. Remove the gcc/testsuite >>> from the front. I don't understand what you mean by "Use standard >>> syntax.", >>> please fix this to describe what you are actually changing. >>> >>>> * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry >>>> for float64x1. >>>> * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New. >>> >>> These two changes need approval from an ARM maintainer as they are in >>> common files. >>> >>> From an AArch64 perspective, this patch is OK with a fixed ChangeLog. >>> Please >>> wait for an ARM OK for the test changes. >> >> >> Considering that the tests' functionality is guarded on #if >> defined(__aarch64__) >> it's a noop on arm and so is ok from that perspective (we have precedence >> for >> tests guarded in such a way in advsimd-intrinsics.exp) >> > > Of course I meant precedent rather than precedence :/ > Unfortunately, the guard is not correct :( The float64_t type is not available on arm, so the new declarations/definitions in arm-neon-ref.h need a guard. Since this patch was checked-in, all the advsimd intrinsics tests fail to compile on arm: In file included from /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c:2:0: /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:22: error: unknown type name 'float64_t' /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:51:35: note: in definition of macro 'VECT_VAR_DECL' /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:8: note: in expansion of macro 'ARRAY' Christophe > Kyrill > > >> The arm-neon-ref.h additions are ok too. >> >> Thanks, >> Kyrill >> >> >>> Thanks, >>> James >>> >>>> diff --git a/gcc/config/aarch64/aarch64-simd.md >>>> b/gcc/config/aarch64/aarch64-simd.md >>>> index >>>> bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 >>>> 100644 >>>> --- a/gcc/config/aarch64/aarch64-simd.md >>>> +++ b/gcc/config/aarch64/aarch64-simd.md >>>> @@ -1579,16 +1579,16 @@ >>>> [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] >>>> ) >>>> -(define_insn "*aarch64_fma4_elt_to_128df" >>>> - [(set (match_operand:V2DF 0 "register_operand" "=w") >>>> - (fma:V2DF >>>> - (vec_duplicate:V2DF >>>> - (match_operand:DF 1 "register_operand" "w")) >>>> - (match_operand:V2DF 2 "register_operand" "w") >>>> - (match_operand:V2DF 3 "register_operand" "0")))] >>>> +(define_insn "*aarch64_fma4_elt_from_dup<mode>" >>>> + [(set (match_operand:VMUL 0 "register_operand" "=w") >>>> + (fma:VMUL >>>> + (vec_duplicate:VMUL >>>> + (match_operand:<VEL> 1 "register_operand" "w")) >>>> + (match_operand:VMUL 2 "register_operand" "w") >>>> + (match_operand:VMUL 3 "register_operand" "0")))] >>>> "TARGET_SIMD" >>>> - "fmla\\t%0.2d, %2.2d, %1.2d[0]" >>>> - [(set_attr "type" "neon_fp_mla_d_scalar_q")] >>>> + "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" >>>> + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] >>>> ) >>>> (define_insn "*aarch64_fma4_elt_to_64v2df" >>>> @@ -1656,17 +1656,17 @@ >>>> [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] >>>> ) >>>> -(define_insn "*aarch64_fnma4_elt_to_128df" >>>> - [(set (match_operand:V2DF 0 "register_operand" "=w") >>>> - (fma:V2DF >>>> - (neg:V2DF >>>> - (match_operand:V2DF 2 "register_operand" "w")) >>>> - (vec_duplicate:V2DF >>>> - (match_operand:DF 1 "register_operand" "w")) >>>> - (match_operand:V2DF 3 "register_operand" "0")))] >>>> - "TARGET_SIMD" >>>> - "fmls\\t%0.2d, %2.2d, %1.2d[0]" >>>> - [(set_attr "type" "neon_fp_mla_d_scalar_q")] >>>> +(define_insn "*aarch64_fnma4_elt_from_dup<mode>" >>>> + [(set (match_operand:VMUL 0 "register_operand" "=w") >>>> + (fma:VMUL >>>> + (neg:VMUL >>>> + (match_operand:VMUL 2 "register_operand" "w")) >>>> + (vec_duplicate:VMUL >>>> + (match_operand:<VEL> 1 "register_operand" "w")) >>>> + (match_operand:VMUL 3 "register_operand" "0")))] >>>> + "TARGET_SIMD" >>>> + "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" >>>> + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] >>>> ) >>>> (define_insn "*aarch64_fnma4_elt_to_64v2df" >>>> diff --git a/gcc/config/aarch64/arm_neon.h >>>> b/gcc/config/aarch64/arm_neon.h >>>> index >>>> 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c >>>> 100644 >>>> --- a/gcc/config/aarch64/arm_neon.h >>>> +++ b/gcc/config/aarch64/arm_neon.h >>>> @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, >>>> float32_t __c) >>>> return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); >>>> } >>>> +__extension__ static __inline float64x1_t __attribute__ >>>> ((__always_inline__)) >>>> +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) >>>> +{ >>>> + return (float64x1_t) {__b[0] * __c + __a[0]}; >>>> +} >>>> + >>>> __extension__ static __inline float32x4_t __attribute__ >>>> ((__always_inline__)) >>>> vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) >>>> { >>>> @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, >>>> float64x2_t __c) >>>> return __builtin_aarch64_fmav2df (-__b, __c, __a); >>>> } >>>> +__extension__ static __inline float32x2_t __attribute__ >>>> ((__always_inline__)) >>>> +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >>>> +{ >>>> + return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a); >>>> +} >>>> + >>>> +__extension__ static __inline float64x1_t __attribute__ >>>> ((__always_inline__)) >>>> +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) >>>> +{ >>>> + return (float64x1_t) {-__b[0] * __c + __a[0]}; >>>> +} >>>> + >>>> +__extension__ static __inline float32x4_t __attribute__ >>>> ((__always_inline__)) >>>> +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) >>>> +{ >>>> + return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a); >>>> +} >>>> + >>>> +__extension__ static __inline float64x2_t __attribute__ >>>> ((__always_inline__)) >>>> +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) >>>> +{ >>>> + return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); >>>> +} >>>> /* vfms_lane */ >>>> diff --git >>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>>> index >>>> 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 >>>> 100644 >>>> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>>> @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4); >>>> #if defined (__ARM_FP16_FORMAT_IEEE) || defined >>>> (__ARM_FP16_FORMAT_ALTERNATIVE) >>>> static ARRAY(result, float, 16, 4); >>>> #endif >>>> +static ARRAY(result, float, 64, 1); >>>> static ARRAY(result, float, 32, 2); >>>> static ARRAY(result, int, 8, 16); >>>> static ARRAY(result, int, 16, 8); >>>> @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8); >>>> extern ARRAY(expected, poly, 16, 4); >>>> extern ARRAY(expected, hfloat, 16, 4); >>>> extern ARRAY(expected, hfloat, 32, 2); >>>> +extern ARRAY(expected, hfloat, 64, 1); >>>> extern ARRAY(expected, int, 8, 16); >>>> extern ARRAY(expected, int, 16, 8); >>>> extern ARRAY(expected, int, 32, 4); >>>> diff --git >>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >>>> new file mode 100644 >>>> index >>>> 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >>>> @@ -0,0 +1,490 @@ >>>> +#include <arm_neon.h> >>>> +#include "arm-neon-ref.h" >>>> +#include "compute-ref-data.h" >>>> + >>>> +#define A0 123.4f >>>> +#define A1 -3.8f >>>> +#define A2 -29.4f >>>> +#define A3 (__builtin_inff ()) >>>> +#define A4 0.0f >>>> +#define A5 24.0f >>>> +#define A6 124.0f >>>> +#define A7 1024.0f >>>> + >>>> +#define B0 -5.8f >>>> +#define B1 -0.0f >>>> +#define B2 -10.8f >>>> +#define B3 10.0f >>>> +#define B4 23.4f >>>> +#define B5 -1234.8f >>>> +#define B6 8.9f >>>> +#define B7 4.0f >>>> + >>>> +#define E0 9.8f >>>> +#define E1 -1024.0f >>>> +#define E2 (-__builtin_inff ()) >>>> +#define E3 479.0f >>>> +float32_t elem0 = E0; >>>> +float32_t elem1 = E1; >>>> +float32_t elem2 = E2; >>>> +float32_t elem3 = E3; >>>> + >>>> +#define DA0 1231234.4 >>>> +#define DA1 -3.8 >>>> +#define DA2 -2980.4 >>>> +#define DA3 -5.8 >>>> +#define DA4 0.01123 >>>> +#define DA5 24.0 >>>> +#define DA6 124.12345 >>>> +#define DA7 1024.0 >>>> + >>>> +#define DB0 -5.8 >>>> +#define DB1 (__builtin_inf ()) >>>> +#define DB2 -105.8 >>>> +#define DB3 10.0 >>>> +#define DB4 (-__builtin_inf ()) >>>> +#define DB5 -1234.8 >>>> +#define DB6 848.9 >>>> +#define DB7 44444.0 >>>> + >>>> +#define DE0 9.8 >>>> +#define DE1 -1024.0 >>>> +#define DE2 105.8 >>>> +#define DE3 479.0 >>>> +float64_t delem0 = DE0; >>>> +float64_t delem1 = DE1; >>>> +float64_t delem2 = DE2; >>>> +float64_t delem3 = DE3; >>>> + >>>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) >>>> + >>>> +/* Expected results for vfms_n. */ >>>> + >>>> +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 >>>> * E0}; >>>> +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 >>>> * E1}; >>>> +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 >>>> * E2}; >>>> +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 >>>> * E3}; >>>> +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * >>>> E0}; >>>> +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * >>>> E1}; >>>> +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * >>>> E2}; >>>> +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * >>>> E3}; >>>> + >>>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = >>>> + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); >>>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = >>>> + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); >>>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = >>>> + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); >>>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = >>>> + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2); >>>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = >>>> + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); >>>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = >>>> + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); >>>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = >>>> + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); >>>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = >>>> + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); >>>> + >>>> + >>>> +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 >>>> * E0, >>>> + A2 + -B2 * E0, A3 + -B3 * E0}; >>>> +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 >>>> * E1, >>>> + A6 + -B6 * E1, A7 + -B7 * E1}; >>>> +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 >>>> * E2, >>>> + A4 + -B4 * E2, A6 + -B6 * E2}; >>>> +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 >>>> * E3, >>>> + A5 + -B5 * E3, A7 + -B7 * E3}; >>>> +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * >>>> E0, >>>> + A2 + B2 * E0, A3 + B3 * E0}; >>>> +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * >>>> E1, >>>> + A6 + B6 * E1, A7 + B7 * E1}; >>>> +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * >>>> E2, >>>> + A4 + B4 * E2, A6 + B6 * E2}; >>>> +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * >>>> E3, >>>> + A5 + B5 * E3, A7 + B7 * E3}; >>>> + >>>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = >>>> + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); >>>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = >>>> + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); >>>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = >>>> + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); >>>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = >>>> + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); >>>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = >>>> + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); >>>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = >>>> + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); >>>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = >>>> + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); >>>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = >>>> + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); >>>> + >>>> +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, >>>> + DA1 + -DB1 * DE0}; >>>> +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1, >>>> + DA3 + -DB3 * DE1}; >>>> +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, >>>> + DA5 + -DB5 * DE2}; >>>> +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, >>>> + DA7 + -DB7 * DE3}; >>>> +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, >>>> + DA1 + DB1 * DE0}; >>>> +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, >>>> + DA3 + DB3 * DE1}; >>>> +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, >>>> + DA5 + DB5 * DE2}; >>>> +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, >>>> + DA7 + DB7 * DE3}; >>>> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = >>>> + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); >>>> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = >>>> + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); >>>> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = >>>> + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); >>>> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = >>>> + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); >>>> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = >>>> + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); >>>> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = >>>> + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); >>>> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = >>>> + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2); >>>> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = >>>> + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); >>>> + >>>> +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; >>>> +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; >>>> +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; >>>> +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; >>>> +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; >>>> +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; >>>> +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; >>>> +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; >>>> + >>>> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = >>>> + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); >>>> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = >>>> + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); >>>> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = >>>> + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1); >>>> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = >>>> + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); >>>> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = >>>> + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); >>>> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = >>>> + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); >>>> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = >>>> + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); >>>> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = >>>> + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); >>>> + >>>> +void exec_vfma_vfms_n (void) >>>> +{ >>>> +#undef TEST_MSG >>>> +#define TEST_MSG "VFMS_VFMA_N (FP32)" >>>> + clean_results (); >>>> + >>>> + DECL_VARIABLE(vsrc_1, float, 32, 2); >>>> + DECL_VARIABLE(vsrc_2, float, 32, 2); >>>> + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; >>>> + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; >>>> + VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); >>>> + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); >>>> + DECL_VARIABLE (vector_res, float, 32, 2) = >>>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>> + VECT_VAR (vsrc_2, float, 32, 2), elem0); >>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>> + VECT_VAR (vector_res, float, 32, 2)); >>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, ""); >>>> + VECT_VAR (vector_res, float, 32, 2) = >>>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>> + VECT_VAR (vsrc_2, float, 32, 2), elem0); >>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>> + VECT_VAR (vector_res, float, 32, 2)); >>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; >>>> + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; >>>> + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); >>>> + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); >>>> + VECT_VAR (vector_res, float, 32, 2) = >>>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>> + VECT_VAR (vsrc_2, float, 32, 2), elem1); >>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>> + VECT_VAR (vector_res, float, 32, 2)); >>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, ""); >>>> + VECT_VAR (vector_res, float, 32, 2) = >>>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>> + VECT_VAR (vsrc_2, float, 32, 2), elem1); >>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>> + VECT_VAR (vector_res, float, 32, 2)); >>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; >>>> + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; >>>> + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); >>>> + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); >>>> + VECT_VAR (vector_res, float, 32, 2) = >>>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>> + VECT_VAR (vsrc_2, float, 32, 2), elem2); >>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>> + VECT_VAR (vector_res, float, 32, 2)); >>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, ""); >>>> + VECT_VAR (vector_res, float, 32, 2) = >>>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>> + VECT_VAR (vsrc_2, float, 32, 2), elem2); >>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>> + VECT_VAR (vector_res, float, 32, 2)); >>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; >>>> + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; >>>> + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); >>>> + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); >>>> + VECT_VAR (vector_res, float, 32, 2) = >>>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>> + VECT_VAR (vsrc_2, float, 32, 2), elem3); >>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>> + VECT_VAR (vector_res, float, 32, 2)); >>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, ""); >>>> + VECT_VAR (vector_res, float, 32, 2) = >>>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>> + VECT_VAR (vsrc_2, float, 32, 2), elem3); >>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>> + VECT_VAR (vector_res, float, 32, 2)); >>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, ""); >>>> + >>>> +#undef TEST_MSG >>>> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)" >>>> + clean_results (); >>>> + >>>> + DECL_VARIABLE(vsrc_1, float, 32, 4); >>>> + DECL_VARIABLE(vsrc_2, float, 32, 4); >>>> + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; >>>> + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; >>>> + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); >>>> + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); >>>> + DECL_VARIABLE (vector_res, float, 32, 4) = >>>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>> + VECT_VAR (vsrc_2, float, 32, 4), elem0); >>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>> + VECT_VAR (vector_res, float, 32, 4)); >>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, ""); >>>> + VECT_VAR (vector_res, float, 32, 4) = >>>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>> + VECT_VAR (vsrc_2, float, 32, 4), elem0); >>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>> + VECT_VAR (vector_res, float, 32, 4)); >>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; >>>> + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; >>>> + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); >>>> + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); >>>> + VECT_VAR (vector_res, float, 32, 4) = >>>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>> + VECT_VAR (vsrc_2, float, 32, 4), elem1); >>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>> + VECT_VAR (vector_res, float, 32, 4)); >>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, ""); >>>> + VECT_VAR (vector_res, float, 32, 4) = >>>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>> + VECT_VAR (vsrc_2, float, 32, 4), elem1); >>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>> + VECT_VAR (vector_res, float, 32, 4)); >>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; >>>> + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; >>>> + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); >>>> + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); >>>> + VECT_VAR (vector_res, float, 32, 4) = >>>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>> + VECT_VAR (vsrc_2, float, 32, 4), elem2); >>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>> + VECT_VAR (vector_res, float, 32, 4)); >>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, ""); >>>> + VECT_VAR (vector_res, float, 32, 4) = >>>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>> + VECT_VAR (vsrc_2, float, 32, 4), elem2); >>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>> + VECT_VAR (vector_res, float, 32, 4)); >>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; >>>> + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; >>>> + VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4); >>>> + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); >>>> + VECT_VAR (vector_res, float, 32, 4) = >>>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>> + VECT_VAR (vsrc_2, float, 32, 4), elem3); >>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>> + VECT_VAR (vector_res, float, 32, 4)); >>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, ""); >>>> + VECT_VAR (vector_res, float, 32, 4) = >>>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>> + VECT_VAR (vsrc_2, float, 32, 4), elem3); >>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>> + VECT_VAR (vector_res, float, 32, 4)); >>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, ""); >>>> + >>>> +#undef TEST_MSG >>>> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" >>>> + clean_results (); >>>> + >>>> + DECL_VARIABLE(vsrc_1, float, 64, 2); >>>> + DECL_VARIABLE(vsrc_2, float, 64, 2); >>>> + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; >>>> + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; >>>> + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); >>>> + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); >>>> + DECL_VARIABLE (vector_res, float, 64, 2) = >>>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>> + VECT_VAR (vsrc_2, float, 64, 2), delem0); >>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>> + VECT_VAR (vector_res, float, 64, 2)); >>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, ""); >>>> + VECT_VAR (vector_res, float, 64, 2) = >>>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>> + VECT_VAR (vsrc_2, float, 64, 2), delem0); >>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>> + VECT_VAR (vector_res, float, 64, 2)); >>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; >>>> + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; >>>> + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); >>>> + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); >>>> + VECT_VAR (vector_res, float, 64, 2) = >>>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>> + VECT_VAR (vsrc_2, float, 64, 2), delem1); >>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>> + VECT_VAR (vector_res, float, 64, 2)); >>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, ""); >>>> + VECT_VAR (vector_res, float, 64, 2) = >>>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>> + VECT_VAR (vsrc_2, float, 64, 2), delem1); >>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>> + VECT_VAR (vector_res, float, 64, 2)); >>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; >>>> + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; >>>> + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); >>>> + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); >>>> + VECT_VAR (vector_res, float, 64, 2) = >>>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>> + VECT_VAR (vsrc_2, float, 64, 2), delem2); >>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>> + VECT_VAR (vector_res, float, 64, 2)); >>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, ""); >>>> + VECT_VAR (vector_res, float, 64, 2) = >>>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>> + VECT_VAR (vsrc_2, float, 64, 2), delem2); >>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>> + VECT_VAR (vector_res, float, 64, 2)); >>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; >>>> + VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7}; >>>> + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); >>>> + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); >>>> + VECT_VAR (vector_res, float, 64, 2) = >>>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>> + VECT_VAR (vsrc_2, float, 64, 2), delem3); >>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>> + VECT_VAR (vector_res, float, 64, 2)); >>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, ""); >>>> + VECT_VAR (vector_res, float, 64, 2) = >>>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>> + VECT_VAR (vsrc_2, float, 64, 2), delem3); >>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>> + VECT_VAR (vector_res, float, 64, 2)); >>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, ""); >>>> + >>>> +#undef TEST_MSG >>>> +#define TEST_MSG "VFMS_VFMA_N (FP64)" >>>> + clean_results (); >>>> + >>>> + DECL_VARIABLE(vsrc_1, float, 64, 1); >>>> + DECL_VARIABLE(vsrc_2, float, 64, 1); >>>> + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; >>>> + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; >>>> + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); >>>> + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); >>>> + DECL_VARIABLE (vector_res, float, 64, 1) = >>>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>> + VECT_VAR (vsrc_2, float, 64, 1), delem0); >>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>> + VECT_VAR (vector_res, float, 64, 1)); >>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, ""); >>>> + VECT_VAR (vector_res, float, 64, 1) = >>>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>> + VECT_VAR (vsrc_2, float, 64, 1), delem0); >>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>> + VECT_VAR (vector_res, float, 64, 1)); >>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; >>>> + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; >>>> + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); >>>> + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); >>>> + VECT_VAR (vector_res, float, 64, 1) = >>>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>> + VECT_VAR (vsrc_2, float, 64, 1), delem1); >>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>> + VECT_VAR (vector_res, float, 64, 1)); >>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, ""); >>>> + VECT_VAR (vector_res, float, 64, 1) = >>>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>> + VECT_VAR (vsrc_2, float, 64, 1), delem1); >>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>> + VECT_VAR (vector_res, float, 64, 1)); >>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4}; >>>> + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; >>>> + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); >>>> + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); >>>> + VECT_VAR (vector_res, float, 64, 1) = >>>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>> + VECT_VAR (vsrc_2, float, 64, 1), delem2); >>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>> + VECT_VAR (vector_res, float, 64, 1)); >>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, ""); >>>> + VECT_VAR (vector_res, float, 64, 1) = >>>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>> + VECT_VAR (vsrc_2, float, 64, 1), delem2); >>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>> + VECT_VAR (vector_res, float, 64, 1)); >>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, ""); >>>> + >>>> + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; >>>> + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; >>>> + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); >>>> + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); >>>> + VECT_VAR (vector_res, float, 64, 1) = >>>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>> + VECT_VAR (vsrc_2, float, 64, 1), delem3); >>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>> + VECT_VAR (vector_res, float, 64, 1)); >>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, ""); >>>> + VECT_VAR (vector_res, float, 64, 1) = >>>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>> + VECT_VAR (vsrc_2, float, 64, 1), delem3); >>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>> + VECT_VAR (vector_res, float, 64, 1)); >>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, ""); >>>> +} >>>> +#endif >>>> + >>>> +int >>>> +main (void) >>>> +{ >>>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) >>>> + exec_vfma_vfms_n (); >>>> +#endif >>>> + return 0; >>>> +} >>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>>> b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>>> index >>>> 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 >>>> 100644 >>>> --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>>> +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>>> @@ -110,6 +110,6 @@ main (int argc, char **argv) >>>> /* vfmaq_lane_f64. >>>> vfma_laneq_f64. >>>> vfmaq_laneq_f64. */ >>>> -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, >>>> v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ >>>> +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, >>>> v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ >>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>>> b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>>> index >>>> 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 >>>> 100644 >>>> --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>>> +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>>> @@ -111,6 +111,6 @@ main (int argc, char **argv) >>>> /* vfmsq_lane_f64. >>>> vfms_laneq_f64. >>>> vfmsq_laneq_f64. */ >>>> -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, >>>> v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ >>>> +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, >>>> v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ >>>> >> >
On 18/05/16 09:25, Christophe Lyon wrote: > On 17 May 2016 at 14:42, Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> wrote: >> On 17/05/16 13:40, Kyrill Tkachov wrote: >>> >>> On 17/05/16 13:20, James Greenhalgh wrote: >>>> On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote: >>>>> The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are >>>>> missing in current gcc arm_neon.h. >>>>> >>>>> Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can >>>>> also comes from "(fma (vec_dup(scalar" where the scalar value is already >>>>> sitting in vector register then duplicated to other lanes, and there is >>>>> no lane size change. >>>>> >>>>> This patch implement this and can generate better code under some >>>>> context. For example: >>>>> >>>>> cat test.c >>>>> === >>>>> typedef __Float32x2_t float32x2_t; >>>>> typedef float float32_t; >>>>> >>>>> float32x2_t >>>>> vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >>>>> { >>>>> return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, >>>>> __c}, __a); >>>>> } >>>>> >>>>> before (-O2) >>>>> === >>>>> vfma_n_f32: >>>>> dup v2.2s, v2.s[0] >>>>> fmla v0.2s, v1.2s, v2.2s >>>>> ret >>>>> after >>>>> === >>>>> vfma_n_f32: >>>>> fmla v0.2s, v1.2s, v2.s[0] >>>>> ret >>>>> >>>>> OK for trunk? >>>>> >>>>> 2016-05-16 Jiong Wang <jiong.wang@arm.com> >>>> This ChangeLog entry is not correctly formatted. There should be two >>>> spaces between your name and your email, and each line should start with >>>> a tab. >>>> >>>>> gcc/ >>>>> * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename >>>>> to *aarch64_fma4_elt_from_dup<mode>. >>>>> (*aarch64_fnma4_elt_to_128df): Rename to >>>>> *aarch64_fnma4_elt_from_dup<mode>. >>>>> * config/aarch64/arm_neon.h (vfma_n_f64): New. >>>>> (vfms_n_f32): Likewise. >>>>> (vfms_n_f64): Likewise. >>>>> (vfmsq_n_f32): Likewise. >>>>> (vfmsq_n_f64): Likewise. >>>>> >>>>> gcc/testsuite/ >>>>> * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use >>>>> standard syntax. >>>>> * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise. >>>> The paths of these two entries are incorrect. Remove the gcc/testsuite >>>> from the front. I don't understand what you mean by "Use standard >>>> syntax.", >>>> please fix this to describe what you are actually changing. >>>> >>>>> * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry >>>>> for float64x1. >>>>> * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New. >>>> These two changes need approval from an ARM maintainer as they are in >>>> common files. >>>> >>>> From an AArch64 perspective, this patch is OK with a fixed ChangeLog. >>>> Please >>>> wait for an ARM OK for the test changes. >>> >>> Considering that the tests' functionality is guarded on #if >>> defined(__aarch64__) >>> it's a noop on arm and so is ok from that perspective (we have precedence >>> for >>> tests guarded in such a way in advsimd-intrinsics.exp) >>> >> Of course I meant precedent rather than precedence :/ >> > Unfortunately, the guard is not correct :( > > The float64_t type is not available on arm, so the new > declarations/definitions in arm-neon-ref.h > need a guard. Indeed. A patch to guard the float64_t definition in arm-neon-ref.h on __aarch64__ is preapproved (or I think would count as obvious in the interest of unbreaking a large number of tests) Sorry for not catching it earlier. Kyrill > Since this patch was checked-in, all the advsimd intrinsics tests fail > to compile > on arm: > In file included from > /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaba.c:2:0: > /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:22: > error: unknown type name 'float64_t' > /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:51:35: > note: in definition of macro 'VECT_VAR_DECL' > /aci-gcc-fsf/sources/gcc-fsf/gccsrc/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h:139:8: > note: in expansion of macro 'ARRAY' > > > Christophe > > >> Kyrill >> >> >>> The arm-neon-ref.h additions are ok too. >>> >>> Thanks, >>> Kyrill >>> >>> >>>> Thanks, >>>> James >>>> >>>>> diff --git a/gcc/config/aarch64/aarch64-simd.md >>>>> b/gcc/config/aarch64/aarch64-simd.md >>>>> index >>>>> bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 >>>>> 100644 >>>>> --- a/gcc/config/aarch64/aarch64-simd.md >>>>> +++ b/gcc/config/aarch64/aarch64-simd.md >>>>> @@ -1579,16 +1579,16 @@ >>>>> [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] >>>>> ) >>>>> -(define_insn "*aarch64_fma4_elt_to_128df" >>>>> - [(set (match_operand:V2DF 0 "register_operand" "=w") >>>>> - (fma:V2DF >>>>> - (vec_duplicate:V2DF >>>>> - (match_operand:DF 1 "register_operand" "w")) >>>>> - (match_operand:V2DF 2 "register_operand" "w") >>>>> - (match_operand:V2DF 3 "register_operand" "0")))] >>>>> +(define_insn "*aarch64_fma4_elt_from_dup<mode>" >>>>> + [(set (match_operand:VMUL 0 "register_operand" "=w") >>>>> + (fma:VMUL >>>>> + (vec_duplicate:VMUL >>>>> + (match_operand:<VEL> 1 "register_operand" "w")) >>>>> + (match_operand:VMUL 2 "register_operand" "w") >>>>> + (match_operand:VMUL 3 "register_operand" "0")))] >>>>> "TARGET_SIMD" >>>>> - "fmla\\t%0.2d, %2.2d, %1.2d[0]" >>>>> - [(set_attr "type" "neon_fp_mla_d_scalar_q")] >>>>> + "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" >>>>> + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] >>>>> ) >>>>> (define_insn "*aarch64_fma4_elt_to_64v2df" >>>>> @@ -1656,17 +1656,17 @@ >>>>> [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] >>>>> ) >>>>> -(define_insn "*aarch64_fnma4_elt_to_128df" >>>>> - [(set (match_operand:V2DF 0 "register_operand" "=w") >>>>> - (fma:V2DF >>>>> - (neg:V2DF >>>>> - (match_operand:V2DF 2 "register_operand" "w")) >>>>> - (vec_duplicate:V2DF >>>>> - (match_operand:DF 1 "register_operand" "w")) >>>>> - (match_operand:V2DF 3 "register_operand" "0")))] >>>>> - "TARGET_SIMD" >>>>> - "fmls\\t%0.2d, %2.2d, %1.2d[0]" >>>>> - [(set_attr "type" "neon_fp_mla_d_scalar_q")] >>>>> +(define_insn "*aarch64_fnma4_elt_from_dup<mode>" >>>>> + [(set (match_operand:VMUL 0 "register_operand" "=w") >>>>> + (fma:VMUL >>>>> + (neg:VMUL >>>>> + (match_operand:VMUL 2 "register_operand" "w")) >>>>> + (vec_duplicate:VMUL >>>>> + (match_operand:<VEL> 1 "register_operand" "w")) >>>>> + (match_operand:VMUL 3 "register_operand" "0")))] >>>>> + "TARGET_SIMD" >>>>> + "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" >>>>> + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] >>>>> ) >>>>> (define_insn "*aarch64_fnma4_elt_to_64v2df" >>>>> diff --git a/gcc/config/aarch64/arm_neon.h >>>>> b/gcc/config/aarch64/arm_neon.h >>>>> index >>>>> 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c >>>>> 100644 >>>>> --- a/gcc/config/aarch64/arm_neon.h >>>>> +++ b/gcc/config/aarch64/arm_neon.h >>>>> @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, >>>>> float32_t __c) >>>>> return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); >>>>> } >>>>> +__extension__ static __inline float64x1_t __attribute__ >>>>> ((__always_inline__)) >>>>> +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) >>>>> +{ >>>>> + return (float64x1_t) {__b[0] * __c + __a[0]}; >>>>> +} >>>>> + >>>>> __extension__ static __inline float32x4_t __attribute__ >>>>> ((__always_inline__)) >>>>> vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) >>>>> { >>>>> @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, >>>>> float64x2_t __c) >>>>> return __builtin_aarch64_fmav2df (-__b, __c, __a); >>>>> } >>>>> +__extension__ static __inline float32x2_t __attribute__ >>>>> ((__always_inline__)) >>>>> +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) >>>>> +{ >>>>> + return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a); >>>>> +} >>>>> + >>>>> +__extension__ static __inline float64x1_t __attribute__ >>>>> ((__always_inline__)) >>>>> +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) >>>>> +{ >>>>> + return (float64x1_t) {-__b[0] * __c + __a[0]}; >>>>> +} >>>>> + >>>>> +__extension__ static __inline float32x4_t __attribute__ >>>>> ((__always_inline__)) >>>>> +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) >>>>> +{ >>>>> + return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a); >>>>> +} >>>>> + >>>>> +__extension__ static __inline float64x2_t __attribute__ >>>>> ((__always_inline__)) >>>>> +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) >>>>> +{ >>>>> + return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); >>>>> +} >>>>> /* vfms_lane */ >>>>> diff --git >>>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>>>> index >>>>> 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 >>>>> 100644 >>>>> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h >>>>> @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4); >>>>> #if defined (__ARM_FP16_FORMAT_IEEE) || defined >>>>> (__ARM_FP16_FORMAT_ALTERNATIVE) >>>>> static ARRAY(result, float, 16, 4); >>>>> #endif >>>>> +static ARRAY(result, float, 64, 1); >>>>> static ARRAY(result, float, 32, 2); >>>>> static ARRAY(result, int, 8, 16); >>>>> static ARRAY(result, int, 16, 8); >>>>> @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8); >>>>> extern ARRAY(expected, poly, 16, 4); >>>>> extern ARRAY(expected, hfloat, 16, 4); >>>>> extern ARRAY(expected, hfloat, 32, 2); >>>>> +extern ARRAY(expected, hfloat, 64, 1); >>>>> extern ARRAY(expected, int, 8, 16); >>>>> extern ARRAY(expected, int, 16, 8); >>>>> extern ARRAY(expected, int, 32, 4); >>>>> diff --git >>>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >>>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >>>>> new file mode 100644 >>>>> index >>>>> 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c >>>>> @@ -0,0 +1,490 @@ >>>>> +#include <arm_neon.h> >>>>> +#include "arm-neon-ref.h" >>>>> +#include "compute-ref-data.h" >>>>> + >>>>> +#define A0 123.4f >>>>> +#define A1 -3.8f >>>>> +#define A2 -29.4f >>>>> +#define A3 (__builtin_inff ()) >>>>> +#define A4 0.0f >>>>> +#define A5 24.0f >>>>> +#define A6 124.0f >>>>> +#define A7 1024.0f >>>>> + >>>>> +#define B0 -5.8f >>>>> +#define B1 -0.0f >>>>> +#define B2 -10.8f >>>>> +#define B3 10.0f >>>>> +#define B4 23.4f >>>>> +#define B5 -1234.8f >>>>> +#define B6 8.9f >>>>> +#define B7 4.0f >>>>> + >>>>> +#define E0 9.8f >>>>> +#define E1 -1024.0f >>>>> +#define E2 (-__builtin_inff ()) >>>>> +#define E3 479.0f >>>>> +float32_t elem0 = E0; >>>>> +float32_t elem1 = E1; >>>>> +float32_t elem2 = E2; >>>>> +float32_t elem3 = E3; >>>>> + >>>>> +#define DA0 1231234.4 >>>>> +#define DA1 -3.8 >>>>> +#define DA2 -2980.4 >>>>> +#define DA3 -5.8 >>>>> +#define DA4 0.01123 >>>>> +#define DA5 24.0 >>>>> +#define DA6 124.12345 >>>>> +#define DA7 1024.0 >>>>> + >>>>> +#define DB0 -5.8 >>>>> +#define DB1 (__builtin_inf ()) >>>>> +#define DB2 -105.8 >>>>> +#define DB3 10.0 >>>>> +#define DB4 (-__builtin_inf ()) >>>>> +#define DB5 -1234.8 >>>>> +#define DB6 848.9 >>>>> +#define DB7 44444.0 >>>>> + >>>>> +#define DE0 9.8 >>>>> +#define DE1 -1024.0 >>>>> +#define DE2 105.8 >>>>> +#define DE3 479.0 >>>>> +float64_t delem0 = DE0; >>>>> +float64_t delem1 = DE1; >>>>> +float64_t delem2 = DE2; >>>>> +float64_t delem3 = DE3; >>>>> + >>>>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) >>>>> + >>>>> +/* Expected results for vfms_n. */ >>>>> + >>>>> +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 >>>>> * E0}; >>>>> +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 >>>>> * E1}; >>>>> +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 >>>>> * E2}; >>>>> +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 >>>>> * E3}; >>>>> +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * >>>>> E0}; >>>>> +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * >>>>> E1}; >>>>> +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * >>>>> E2}; >>>>> +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * >>>>> E3}; >>>>> + >>>>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); >>>>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); >>>>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); >>>>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2); >>>>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); >>>>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); >>>>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); >>>>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); >>>>> + >>>>> + >>>>> +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 >>>>> * E0, >>>>> + A2 + -B2 * E0, A3 + -B3 * E0}; >>>>> +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 >>>>> * E1, >>>>> + A6 + -B6 * E1, A7 + -B7 * E1}; >>>>> +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 >>>>> * E2, >>>>> + A4 + -B4 * E2, A6 + -B6 * E2}; >>>>> +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 >>>>> * E3, >>>>> + A5 + -B5 * E3, A7 + -B7 * E3}; >>>>> +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * >>>>> E0, >>>>> + A2 + B2 * E0, A3 + B3 * E0}; >>>>> +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * >>>>> E1, >>>>> + A6 + B6 * E1, A7 + B7 * E1}; >>>>> +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * >>>>> E2, >>>>> + A4 + B4 * E2, A6 + B6 * E2}; >>>>> +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * >>>>> E3, >>>>> + A5 + B5 * E3, A7 + B7 * E3}; >>>>> + >>>>> +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); >>>>> +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); >>>>> +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); >>>>> +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); >>>>> +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); >>>>> +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); >>>>> +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); >>>>> +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = >>>>> + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); >>>>> + >>>>> +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, >>>>> + DA1 + -DB1 * DE0}; >>>>> +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1, >>>>> + DA3 + -DB3 * DE1}; >>>>> +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, >>>>> + DA5 + -DB5 * DE2}; >>>>> +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, >>>>> + DA7 + -DB7 * DE3}; >>>>> +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, >>>>> + DA1 + DB1 * DE0}; >>>>> +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, >>>>> + DA3 + DB3 * DE1}; >>>>> +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, >>>>> + DA5 + DB5 * DE2}; >>>>> +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, >>>>> + DA7 + DB7 * DE3}; >>>>> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); >>>>> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); >>>>> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); >>>>> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); >>>>> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); >>>>> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); >>>>> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2); >>>>> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); >>>>> + >>>>> +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; >>>>> +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; >>>>> +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; >>>>> +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; >>>>> +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; >>>>> +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; >>>>> +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; >>>>> +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; >>>>> + >>>>> +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); >>>>> +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); >>>>> +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1); >>>>> +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); >>>>> +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); >>>>> +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); >>>>> +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); >>>>> +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = >>>>> + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); >>>>> + >>>>> +void exec_vfma_vfms_n (void) >>>>> +{ >>>>> +#undef TEST_MSG >>>>> +#define TEST_MSG "VFMS_VFMA_N (FP32)" >>>>> + clean_results (); >>>>> + >>>>> + DECL_VARIABLE(vsrc_1, float, 32, 2); >>>>> + DECL_VARIABLE(vsrc_2, float, 32, 2); >>>>> + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; >>>>> + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; >>>>> + VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); >>>>> + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); >>>>> + DECL_VARIABLE (vector_res, float, 32, 2) = >>>>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>>> + VECT_VAR (vsrc_2, float, 32, 2), elem0); >>>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>>> + VECT_VAR (vector_res, float, 32, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, ""); >>>>> + VECT_VAR (vector_res, float, 32, 2) = >>>>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>>> + VECT_VAR (vsrc_2, float, 32, 2), elem0); >>>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>>> + VECT_VAR (vector_res, float, 32, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; >>>>> + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; >>>>> + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); >>>>> + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); >>>>> + VECT_VAR (vector_res, float, 32, 2) = >>>>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>>> + VECT_VAR (vsrc_2, float, 32, 2), elem1); >>>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>>> + VECT_VAR (vector_res, float, 32, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, ""); >>>>> + VECT_VAR (vector_res, float, 32, 2) = >>>>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>>> + VECT_VAR (vsrc_2, float, 32, 2), elem1); >>>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>>> + VECT_VAR (vector_res, float, 32, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; >>>>> + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; >>>>> + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); >>>>> + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); >>>>> + VECT_VAR (vector_res, float, 32, 2) = >>>>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>>> + VECT_VAR (vsrc_2, float, 32, 2), elem2); >>>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>>> + VECT_VAR (vector_res, float, 32, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, ""); >>>>> + VECT_VAR (vector_res, float, 32, 2) = >>>>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>>> + VECT_VAR (vsrc_2, float, 32, 2), elem2); >>>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>>> + VECT_VAR (vector_res, float, 32, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; >>>>> + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; >>>>> + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); >>>>> + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); >>>>> + VECT_VAR (vector_res, float, 32, 2) = >>>>> + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>>> + VECT_VAR (vsrc_2, float, 32, 2), elem3); >>>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>>> + VECT_VAR (vector_res, float, 32, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, ""); >>>>> + VECT_VAR (vector_res, float, 32, 2) = >>>>> + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), >>>>> + VECT_VAR (vsrc_2, float, 32, 2), elem3); >>>>> + vst1_f32 (VECT_VAR (result, float, 32, 2), >>>>> + VECT_VAR (vector_res, float, 32, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, ""); >>>>> + >>>>> +#undef TEST_MSG >>>>> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)" >>>>> + clean_results (); >>>>> + >>>>> + DECL_VARIABLE(vsrc_1, float, 32, 4); >>>>> + DECL_VARIABLE(vsrc_2, float, 32, 4); >>>>> + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; >>>>> + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; >>>>> + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); >>>>> + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); >>>>> + DECL_VARIABLE (vector_res, float, 32, 4) = >>>>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>>> + VECT_VAR (vsrc_2, float, 32, 4), elem0); >>>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>>> + VECT_VAR (vector_res, float, 32, 4)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, ""); >>>>> + VECT_VAR (vector_res, float, 32, 4) = >>>>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>>> + VECT_VAR (vsrc_2, float, 32, 4), elem0); >>>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>>> + VECT_VAR (vector_res, float, 32, 4)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; >>>>> + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; >>>>> + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); >>>>> + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); >>>>> + VECT_VAR (vector_res, float, 32, 4) = >>>>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>>> + VECT_VAR (vsrc_2, float, 32, 4), elem1); >>>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>>> + VECT_VAR (vector_res, float, 32, 4)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, ""); >>>>> + VECT_VAR (vector_res, float, 32, 4) = >>>>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>>> + VECT_VAR (vsrc_2, float, 32, 4), elem1); >>>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>>> + VECT_VAR (vector_res, float, 32, 4)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; >>>>> + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; >>>>> + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); >>>>> + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); >>>>> + VECT_VAR (vector_res, float, 32, 4) = >>>>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>>> + VECT_VAR (vsrc_2, float, 32, 4), elem2); >>>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>>> + VECT_VAR (vector_res, float, 32, 4)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, ""); >>>>> + VECT_VAR (vector_res, float, 32, 4) = >>>>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>>> + VECT_VAR (vsrc_2, float, 32, 4), elem2); >>>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>>> + VECT_VAR (vector_res, float, 32, 4)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; >>>>> + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; >>>>> + VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4); >>>>> + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); >>>>> + VECT_VAR (vector_res, float, 32, 4) = >>>>> + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>>> + VECT_VAR (vsrc_2, float, 32, 4), elem3); >>>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>>> + VECT_VAR (vector_res, float, 32, 4)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, ""); >>>>> + VECT_VAR (vector_res, float, 32, 4) = >>>>> + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), >>>>> + VECT_VAR (vsrc_2, float, 32, 4), elem3); >>>>> + vst1q_f32 (VECT_VAR (result, float, 32, 4), >>>>> + VECT_VAR (vector_res, float, 32, 4)); >>>>> + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, ""); >>>>> + >>>>> +#undef TEST_MSG >>>>> +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" >>>>> + clean_results (); >>>>> + >>>>> + DECL_VARIABLE(vsrc_1, float, 64, 2); >>>>> + DECL_VARIABLE(vsrc_2, float, 64, 2); >>>>> + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; >>>>> + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; >>>>> + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); >>>>> + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); >>>>> + DECL_VARIABLE (vector_res, float, 64, 2) = >>>>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>>> + VECT_VAR (vsrc_2, float, 64, 2), delem0); >>>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>>> + VECT_VAR (vector_res, float, 64, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, ""); >>>>> + VECT_VAR (vector_res, float, 64, 2) = >>>>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>>> + VECT_VAR (vsrc_2, float, 64, 2), delem0); >>>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>>> + VECT_VAR (vector_res, float, 64, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; >>>>> + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; >>>>> + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); >>>>> + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); >>>>> + VECT_VAR (vector_res, float, 64, 2) = >>>>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>>> + VECT_VAR (vsrc_2, float, 64, 2), delem1); >>>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>>> + VECT_VAR (vector_res, float, 64, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, ""); >>>>> + VECT_VAR (vector_res, float, 64, 2) = >>>>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>>> + VECT_VAR (vsrc_2, float, 64, 2), delem1); >>>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>>> + VECT_VAR (vector_res, float, 64, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; >>>>> + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; >>>>> + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); >>>>> + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); >>>>> + VECT_VAR (vector_res, float, 64, 2) = >>>>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>>> + VECT_VAR (vsrc_2, float, 64, 2), delem2); >>>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>>> + VECT_VAR (vector_res, float, 64, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, ""); >>>>> + VECT_VAR (vector_res, float, 64, 2) = >>>>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>>> + VECT_VAR (vsrc_2, float, 64, 2), delem2); >>>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>>> + VECT_VAR (vector_res, float, 64, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; >>>>> + VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7}; >>>>> + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); >>>>> + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); >>>>> + VECT_VAR (vector_res, float, 64, 2) = >>>>> + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>>> + VECT_VAR (vsrc_2, float, 64, 2), delem3); >>>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>>> + VECT_VAR (vector_res, float, 64, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, ""); >>>>> + VECT_VAR (vector_res, float, 64, 2) = >>>>> + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), >>>>> + VECT_VAR (vsrc_2, float, 64, 2), delem3); >>>>> + vst1q_f64 (VECT_VAR (result, float, 64, 2), >>>>> + VECT_VAR (vector_res, float, 64, 2)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, ""); >>>>> + >>>>> +#undef TEST_MSG >>>>> +#define TEST_MSG "VFMS_VFMA_N (FP64)" >>>>> + clean_results (); >>>>> + >>>>> + DECL_VARIABLE(vsrc_1, float, 64, 1); >>>>> + DECL_VARIABLE(vsrc_2, float, 64, 1); >>>>> + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; >>>>> + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; >>>>> + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); >>>>> + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); >>>>> + DECL_VARIABLE (vector_res, float, 64, 1) = >>>>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>>> + VECT_VAR (vsrc_2, float, 64, 1), delem0); >>>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>>> + VECT_VAR (vector_res, float, 64, 1)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, ""); >>>>> + VECT_VAR (vector_res, float, 64, 1) = >>>>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>>> + VECT_VAR (vsrc_2, float, 64, 1), delem0); >>>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>>> + VECT_VAR (vector_res, float, 64, 1)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; >>>>> + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; >>>>> + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); >>>>> + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); >>>>> + VECT_VAR (vector_res, float, 64, 1) = >>>>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>>> + VECT_VAR (vsrc_2, float, 64, 1), delem1); >>>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>>> + VECT_VAR (vector_res, float, 64, 1)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, ""); >>>>> + VECT_VAR (vector_res, float, 64, 1) = >>>>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>>> + VECT_VAR (vsrc_2, float, 64, 1), delem1); >>>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>>> + VECT_VAR (vector_res, float, 64, 1)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4}; >>>>> + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; >>>>> + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); >>>>> + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); >>>>> + VECT_VAR (vector_res, float, 64, 1) = >>>>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>>> + VECT_VAR (vsrc_2, float, 64, 1), delem2); >>>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>>> + VECT_VAR (vector_res, float, 64, 1)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, ""); >>>>> + VECT_VAR (vector_res, float, 64, 1) = >>>>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>>> + VECT_VAR (vsrc_2, float, 64, 1), delem2); >>>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>>> + VECT_VAR (vector_res, float, 64, 1)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, ""); >>>>> + >>>>> + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; >>>>> + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; >>>>> + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); >>>>> + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); >>>>> + VECT_VAR (vector_res, float, 64, 1) = >>>>> + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>>> + VECT_VAR (vsrc_2, float, 64, 1), delem3); >>>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>>> + VECT_VAR (vector_res, float, 64, 1)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, ""); >>>>> + VECT_VAR (vector_res, float, 64, 1) = >>>>> + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), >>>>> + VECT_VAR (vsrc_2, float, 64, 1), delem3); >>>>> + vst1_f64 (VECT_VAR (result, float, 64, 1), >>>>> + VECT_VAR (vector_res, float, 64, 1)); >>>>> + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, ""); >>>>> +} >>>>> +#endif >>>>> + >>>>> +int >>>>> +main (void) >>>>> +{ >>>>> +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) >>>>> + exec_vfma_vfms_n (); >>>>> +#endif >>>>> + return 0; >>>>> +} >>>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>>>> b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>>>> index >>>>> 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 >>>>> 100644 >>>>> --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>>>> +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c >>>>> @@ -110,6 +110,6 @@ main (int argc, char **argv) >>>>> /* vfmaq_lane_f64. >>>>> vfma_laneq_f64. >>>>> vfmaq_laneq_f64. */ >>>>> -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, >>>>> v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ >>>>> +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, >>>>> v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ >>>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>>>> b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>>>> index >>>>> 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 >>>>> 100644 >>>>> --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>>>> +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c >>>>> @@ -111,6 +111,6 @@ main (int argc, char **argv) >>>>> /* vfmsq_lane_f64. >>>>> vfms_laneq_f64. >>>>> vfmsq_laneq_f64. */ >>>>> -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, >>>>> v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ >>>>> +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, >>>>> v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ >>>>>
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1579,16 +1579,16 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fma4_elt_to_128df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (fma:V2DF - (vec_duplicate:V2DF - (match_operand:DF 1 "register_operand" "w")) - (match_operand:V2DF 2 "register_operand" "w") - (match_operand:V2DF 3 "register_operand" "0")))] +(define_insn "*aarch64_fma4_elt_from_dup<mode>" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (fma:VMUL + (vec_duplicate:VMUL + (match_operand:<VEL> 1 "register_operand" "w")) + (match_operand:VMUL 2 "register_operand" "w") + (match_operand:VMUL 3 "register_operand" "0")))] "TARGET_SIMD" - "fmla\\t%0.2d, %2.2d, %1.2d[0]" - [(set_attr "type" "neon_fp_mla_d_scalar_q")] + "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] ) (define_insn "*aarch64_fma4_elt_to_64v2df" @@ -1656,17 +1656,17 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fnma4_elt_to_128df" - [(set (match_operand:V2DF 0 "register_operand" "=w") - (fma:V2DF - (neg:V2DF - (match_operand:V2DF 2 "register_operand" "w")) - (vec_duplicate:V2DF - (match_operand:DF 1 "register_operand" "w")) - (match_operand:V2DF 3 "register_operand" "0")))] - "TARGET_SIMD" - "fmls\\t%0.2d, %2.2d, %1.2d[0]" - [(set_attr "type" "neon_fp_mla_d_scalar_q")] +(define_insn "*aarch64_fnma4_elt_from_dup<mode>" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (fma:VMUL + (neg:VMUL + (match_operand:VMUL 2 "register_operand" "w")) + (vec_duplicate:VMUL + (match_operand:<VEL> 1 "register_operand" "w")) + (match_operand:VMUL 3 "register_operand" "0")))] + "TARGET_SIMD" + "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] ) (define_insn "*aarch64_fnma4_elt_to_64v2df" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); } +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) +{ + return (float64x1_t) {__b[0] * __c + __a[0]}; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) return __builtin_aarch64_fmav2df (-__b, __c, __a); } +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) +{ + return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) +{ + return (float64x1_t) {-__b[0] * __c + __a[0]}; +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) +{ + return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) +{ + return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); +} /* vfms_lane */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h index 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4); #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) static ARRAY(result, float, 16, 4); #endif +static ARRAY(result, float, 64, 1); static ARRAY(result, float, 32, 2); static ARRAY(result, int, 8, 16); static ARRAY(result, int, 16, 8); @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8); extern ARRAY(expected, poly, 16, 4); extern ARRAY(expected, hfloat, 16, 4); extern ARRAY(expected, hfloat, 32, 2); +extern ARRAY(expected, hfloat, 64, 1); extern ARRAY(expected, int, 8, 16); extern ARRAY(expected, int, 16, 8); extern ARRAY(expected, int, 32, 4); diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c new file mode 100644 index 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c @@ -0,0 +1,490 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +#define A0 123.4f +#define A1 -3.8f +#define A2 -29.4f +#define A3 (__builtin_inff ()) +#define A4 0.0f +#define A5 24.0f +#define A6 124.0f +#define A7 1024.0f + +#define B0 -5.8f +#define B1 -0.0f +#define B2 -10.8f +#define B3 10.0f +#define B4 23.4f +#define B5 -1234.8f +#define B6 8.9f +#define B7 4.0f + +#define E0 9.8f +#define E1 -1024.0f +#define E2 (-__builtin_inff ()) +#define E3 479.0f +float32_t elem0 = E0; +float32_t elem1 = E1; +float32_t elem2 = E2; +float32_t elem3 = E3; + +#define DA0 1231234.4 +#define DA1 -3.8 +#define DA2 -2980.4 +#define DA3 -5.8 +#define DA4 0.01123 +#define DA5 24.0 +#define DA6 124.12345 +#define DA7 1024.0 + +#define DB0 -5.8 +#define DB1 (__builtin_inf ()) +#define DB2 -105.8 +#define DB3 10.0 +#define DB4 (-__builtin_inf ()) +#define DB5 -1234.8 +#define DB6 848.9 +#define DB7 44444.0 + +#define DE0 9.8 +#define DE1 -1024.0 +#define DE2 105.8 +#define DE3 479.0 +float64_t delem0 = DE0; +float64_t delem1 = DE1; +float64_t delem2 = DE2; +float64_t delem3 = DE3; + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) + +/* Expected results for vfms_n. */ + +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0}; +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1}; +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2}; +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3}; +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0}; +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1}; +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2}; +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3}; + +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); + + +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0, + A2 + -B2 * E0, A3 + -B3 * E0}; +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1, + A6 + -B6 * E1, A7 + -B7 * E1}; +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2, + A4 + -B4 * E2, A6 + -B6 * E2}; +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3, + A5 + -B5 * E3, A7 + -B7 * E3}; +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0, + A2 + B2 * E0, A3 + B3 * E0}; +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1, + A6 + B6 * E1, A7 + B7 * E1}; +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2, + A4 + B4 * E2, A6 + B6 * E2}; +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3, + A5 + B5 * E3, A7 + B7 * E3}; + +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); + +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, + DA1 + -DB1 * DE0}; +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1, + DA3 + -DB3 * DE1}; +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, + DA5 + -DB5 * DE2}; +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, + DA7 + -DB7 * DE3}; +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, + DA1 + DB1 * DE0}; +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, + DA3 + DB3 * DE1}; +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, + DA5 + DB5 * DE2}; +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, + DA7 + DB7 * DE3}; +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2); +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); + +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; + +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); + +void exec_vfma_vfms_n (void) +{ +#undef TEST_MSG +#define TEST_MSG "VFMS_VFMA_N (FP32)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 32, 2); + DECL_VARIABLE(vsrc_2, float, 32, 2); + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; + VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); + DECL_VARIABLE (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem0); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem0); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem1); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem1); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem2); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem2); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); + VECT_VAR (vector_res, float, 32, 2) = + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem3); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 32, 2) = + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), + VECT_VAR (vsrc_2, float, 32, 2), elem3); + vst1_f32 (VECT_VAR (result, float, 32, 2), + VECT_VAR (vector_res, float, 32, 2)); + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 32, 4); + DECL_VARIABLE(vsrc_2, float, 32, 4); + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); + DECL_VARIABLE (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem0); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem0); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem1); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem1); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem2); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem2); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; + VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4); + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); + VECT_VAR (vector_res, float, 32, 4) = + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem3); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 32, 4) = + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), + VECT_VAR (vsrc_2, float, 32, 4), elem3); + vst1q_f32 (VECT_VAR (result, float, 32, 4), + VECT_VAR (vector_res, float, 32, 4)); + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 64, 2); + DECL_VARIABLE(vsrc_2, float, 64, 2); + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); + DECL_VARIABLE (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem0); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem0); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem1); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem1); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem2); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem2); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; + VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7}; + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); + VECT_VAR (vector_res, float, 64, 2) = + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem3); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 64, 2) = + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), + VECT_VAR (vsrc_2, float, 64, 2), delem3); + vst1q_f64 (VECT_VAR (result, float, 64, 2), + VECT_VAR (vector_res, float, 64, 2)); + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, ""); + +#undef TEST_MSG +#define TEST_MSG "VFMS_VFMA_N (FP64)" + clean_results (); + + DECL_VARIABLE(vsrc_1, float, 64, 1); + DECL_VARIABLE(vsrc_2, float, 64, 1); + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); + DECL_VARIABLE (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem0); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem0); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, ""); + + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); + VECT_VAR (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem1); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem1); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, ""); + + VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4}; + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); + VECT_VAR (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem2); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem2); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, ""); + + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); + VECT_VAR (vector_res, float, 64, 1) = + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem3); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, ""); + VECT_VAR (vector_res, float, 64, 1) = + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), + VECT_VAR (vsrc_2, float, 64, 1), delem3); + vst1_f64 (VECT_VAR (result, float, 64, 1), + VECT_VAR (vector_res, float, 64, 1)); + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, ""); +} +#endif + +int +main (void) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) + exec_vfma_vfms_n (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c index 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 100644 --- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c @@ -110,6 +110,6 @@ main (int argc, char **argv) /* vfmaq_lane_f64. vfma_laneq_f64. vfmaq_laneq_f64. */ -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c index 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 100644 --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c @@ -111,6 +111,6 @@ main (int argc, char **argv) /* vfmsq_lane_f64. vfms_laneq_f64. vfmsq_laneq_f64. */ -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */