Message ID | 44ca371c-24bb-3a47-7dea-58aeb1595697@arm.com |
---|---|
State | New |
Headers | show |
Series | [AArch64] Add ACLE intrinsics for bfdot for ARMv8.6 Extension | expand |
Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes: > Hi all, > > This patch adds the ARMv8.6 Extension ACLE intrinsics for the bfloat bfdot > operation. > > The functions are declared in arm_neon.h with the armv8.2-a+bf16 target option > as required. > > RTL patterns are defined to generate assembler. > > Tests added to verify expected assembly and perform adequate lane checks. > > This patch depends on: > > https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html > > for testuite effective_target update and on: > > https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html > https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html > > for back-end Bfloat enablement. > > Cheers, > Stam > > > gcc/ChangeLog: > > 2019-11-04 Stam Markianos-Wright <stam.markianos-wright@arm.com> > > * config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot, > aarch64_bfdot_lane, aarch64_bfdot_laneq): New. > * config/aarch64/aarch64-simd.md > (aarch64_bfdot, aarch64_bfdot_lane): New. > * config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32, > vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New. > * config/aarch64/iterators.md (UNSPEC_BFDOT, VBF, isquadop, Vbfdottype, > VBFMLA_W): New. Changelog nit: the continuation lines should be indened by a tab only. > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -7027,3 +7027,37 @@ > "xtn\t%0.<Vntype>, %1.<Vtype>" > [(set_attr "type" "neon_shift_imm_narrow_q")] > ) > + > +(define_insn "aarch64_bfdot<mode>" > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0") > + (unspec:VDQSF [(match_operand:<VBFMLA_W> 2 > + "register_operand" "w") > + (match_operand:<VBFMLA_W> 3 > + "register_operand" "w")] > + UNSPEC_BFDOT)))] The operands to the plus should be the other way around, so that the more complicated operand comes first, > + "TARGET_BF16_SIMD" > + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" > + [(set_attr "type" "neon_dot<q>")] > +) > + > + > +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0") > + (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2 > + "register_operand" "w") > + (match_operand: VBF 3 Nit: should be no space before "VBF". > + "register_operand" "w") > + (match_operand:SI 4 > + "const_int_operand" "n")] > + UNSPEC_BFDOT)))] > + "TARGET_BF16_SIMD" > +{ > + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); > + int lane = INTVAL (operands[4]); > + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); Should only be one space after "=". > + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; > +} > + [(set_attr "type" "neon_dot<VDQSF:q>")] > +) > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -34612,6 +34612,57 @@ vrnd64xq_f64 (float64x2_t __a) > > #include "arm_bf16.h" > > +#pragma GCC push_options > +#pragma GCC target ("arch=armv8.2-a+bf16") > + > +__extension__ extern __inline float32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) > +{ > + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); > +} > + > +__extension__ extern __inline float32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) > +{ > + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); > +} > + > +__extension__ extern __inline float32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdot_lane_f32 \ > + (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int __index) Stray backslash (same comment as for the USDOT/SUDOT review just posted). > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > new file mode 100644 > index 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > @@ -0,0 +1,80 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-additional-options "--save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ > + > +#include <arm_neon.h> > + > +/* > +**ufoo: > +** ... > +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h > +** ... > +** ret > +*/ > +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} Same comments as for SUDOT and USDOT here too. Thanks, Richard
On 12/20/19 2:36 PM, Richard Sandiford wrote: > Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes: >> Hi all, >> >> This patch adds the ARMv8.6 Extension ACLE intrinsics for the bfloat bfdot >> operation. >> >> The functions are declared in arm_neon.h with the armv8.2-a+bf16 target option >> as required. >> >> RTL patterns are defined to generate assembler. >> >> Tests added to verify expected assembly and perform adequate lane checks. >> >> This patch depends on: >> >> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html >> >> for testuite effective_target update and on: >> >> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html >> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html >> >> for back-end Bfloat enablement. >> >> Cheers, >> Stam >> >> >> gcc/ChangeLog: >> >> 2019-11-04 Stam Markianos-Wright <stam.markianos-wright@arm.com> >> >> * config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot, >> aarch64_bfdot_lane, aarch64_bfdot_laneq): New. >> * config/aarch64/aarch64-simd.md >> (aarch64_bfdot, aarch64_bfdot_lane): New. >> * config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32, >> vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New. >> * config/aarch64/iterators.md (UNSPEC_BFDOT, VBF, isquadop, Vbfdottype, >> VBFMLA_W): New. > > Changelog nit: the continuation lines should be indened by a tab only. Yes, sorry, that's my email client messing things up again! Fixed locally and will carry over when I do the commit. > >> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md >> index c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 100644 >> --- a/gcc/config/aarch64/aarch64-simd.md >> +++ b/gcc/config/aarch64/aarch64-simd.md >> @@ -7027,3 +7027,37 @@ >> "xtn\t%0.<Vntype>, %1.<Vtype>" >> [(set_attr "type" "neon_shift_imm_narrow_q")] >> ) >> + >> +(define_insn "aarch64_bfdot<mode>" >> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >> + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0") >> + (unspec:VDQSF [(match_operand:<VBFMLA_W> 2 >> + "register_operand" "w") >> + (match_operand:<VBFMLA_W> 3 >> + "register_operand" "w")] >> + UNSPEC_BFDOT)))] > > The operands to the plus should be the other way around, so that > the more complicated operand comes first, > Done >> + "TARGET_BF16_SIMD" >> + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" >> + [(set_attr "type" "neon_dot<q>")] >> +) >> + >> + >> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" >> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >> + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0") >> + (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2 >> + "register_operand" "w") >> + (match_operand: VBF 3 > > Nit: should be no space before "VBF". Done > >> + "register_operand" "w") >> + (match_operand:SI 4 >> + "const_int_operand" "n")] >> + UNSPEC_BFDOT)))] >> + "TARGET_BF16_SIMD" >> +{ >> + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); >> + int lane = INTVAL (operands[4]); >> + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); > > Should only be one space after "=". Done > >> + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; >> +} >> + [(set_attr "type" "neon_dot<VDQSF:q>")] >> +) >> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h >> index 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 100644 >> --- a/gcc/config/aarch64/arm_neon.h >> +++ b/gcc/config/aarch64/arm_neon.h >> @@ -34612,6 +34612,57 @@ vrnd64xq_f64 (float64x2_t __a) >> >> #include "arm_bf16.h" >> >> +#pragma GCC push_options >> +#pragma GCC target ("arch=armv8.2-a+bf16") >> + >> +__extension__ extern __inline float32x2_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) >> +{ >> + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); >> +} >> + >> +__extension__ extern __inline float32x4_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) >> +{ >> + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); >> +} >> + >> +__extension__ extern __inline float32x2_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdot_lane_f32 \ >> + (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int __index) > > Stray backslash (same comment as for the USDOT/SUDOT review > just posted). Done > >> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >> new file mode 100644 >> index 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >> @@ -0,0 +1,80 @@ >> +/* { dg-do assemble { target { aarch64*-*-* } } } */ >> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ >> +/* { dg-add-options arm_v8_2a_bf16_neon } */ >> +/* { dg-additional-options "--save-temps" } */ >> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ >> + >> +#include <arm_neon.h> >> + >> +/* >> +**ufoo: >> +** ... >> +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h >> +** ... >> +** ret >> +*/ >> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_f32 (r, x, y); >> +} > > Same comments as for SUDOT and USDOT here too. Same changes as US/SUDOT. Thank you! Stam > > Thanks, > Richard > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..6c5b61c37bcb340f963861723c6e365e32f6ca95 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -682,3 +682,8 @@ BUILTIN_VSFDF (UNOP, frint32x, 0) BUILTIN_VSFDF (UNOP, frint64z, 0) BUILTIN_VSFDF (UNOP, frint64x, 0) + + /* Implemented by aarch64_bfdot{_lane}{q}<mode>. */ + VAR2 (TERNOP, bfdot, 0, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -7028,3 +7028,36 @@ "xtn\t%0.<Vntype>, %1.<Vtype>" [(set_attr "type" "neon_shift_imm_narrow_q")] ) + +(define_insn "aarch64_bfdot<mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (plus:VDQSF + (unspec:VDQSF + [(match_operand:<VBFMLA_W> 2 "register_operand" "w") + (match_operand:<VBFMLA_W> 3 "register_operand" "w")] + UNSPEC_BFDOT) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_BF16_SIMD" + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" + [(set_attr "type" "neon_dot<q>")] +) + + +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (plus:VDQSF + (unspec:VDQSF + [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w") + (match_operand:VBF 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + UNSPEC_BFDOT) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_BF16_SIMD" +{ + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); + int lane = INTVAL (operands[4]); + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; +} + [(set_attr "type" "neon_dot<VDQSF:q>")] +) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index ee4bb76bcd4f52bdf99ba9b24fc5749ba555a73b..c304c2c4597550882377d1dfce03fff92e8ebde3 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a) #include "arm_bf16.h" +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) +{ + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); +} + +#pragma GCC pop_options + #undef __aarch64_vget_lane_any #undef __aarch64_vdup_lane_any diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 04262645a019087b600ff47667c13381dab10d66..2277abcaf7f10a256ddbadb1d4be40ba42f0ac67 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -119,6 +119,9 @@ ;; Quad vector with only 2 element modes. (define_mode_iterator VQ_2E [V2DI V2DF]) +;; BFmode vector modes. +(define_mode_iterator VBF [V4BF V8BF]) + ;; This mode iterator allows :P to be used for patterns that operate on ;; addresses in different modes. In LP64, only DI will match, while in ;; ILP32, either can match. @@ -671,6 +674,7 @@ UNSPEC_UMULHS ; Used in aarch64-sve2.md. UNSPEC_UMULHRS ; Used in aarch64-sve2.md. UNSPEC_ASRD ; Used in aarch64-sve.md. + UNSPEC_BFDOT ; Used in aarch64-simd.md. ]) ;; ------------------------------------------------------------------ @@ -727,6 +731,8 @@ (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")]) +(define_mode_attr isquadop [(V4BF "") (V8BF "q")]) + ;; For scalar usage of vector/FP registers (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d") (HF "h") (SF "s") (DF "d") @@ -1308,6 +1314,9 @@ ;; Register suffix for DOTPROD input types from the return type. (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")]) +;; Register suffix for BFDOT input types from the return type. +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")]) + ;; Sum of lengths of instructions needed to move vector registers of a mode. (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")]) @@ -1318,6 +1327,9 @@ ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")]) +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")]) + (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")]) (define_mode_attr f16quad [(V2SF "") (V4SF "q")]) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c new file mode 100644 index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c @@ -0,0 +1,91 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O -save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include <arm_neon.h> + +/* +**ufoo: +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) +** ret +*/ +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq: +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) +** ret +*/ +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_f32 (r, x, y); +} + +/* +**ufoo_lane: +** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\]) +** ret +*/ +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, 0); +} + +/* +**ufooq_laneq: +** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\]) +** ret +*/ +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, 2); +} + +/* +**ufoo_laneq: +** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\]) +** ret +*/ +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 3); +} + +/* +**ufooq_lane: +** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\]) +** ret +*/ +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + +/* +**ufoo_untied: +** mov v0.8b, v1.8b +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) +** ret +*/ +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq_lane_untied: +** mov v0.16b, v1.16b +** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\]) +** ret +*/ +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c new file mode 100644 index 0000000000000000000000000000000000000000..a4da60a0a721c6ea819e28cb8f178c317eb54de1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c @@ -0,0 +1,91 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O -mbig-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include <arm_neon.h> + +/* +**ufoo: +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) +** ret +*/ +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq: +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) +** ret +*/ +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_f32 (r, x, y); +} + +/* +**ufoo_lane: +** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\]) +** ret +*/ +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, 0); +} + +/* +**ufooq_laneq: +** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\]) +** ret +*/ +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, 2); +} + +/* +**ufoo_laneq: +** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\]) +** ret +*/ +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 3); +} + +/* +**ufooq_lane: +** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\]) +** ret +*/ +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + +/* +**ufoo_untied: +** mov v0.8b, v1.8b +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) +** ret +*/ +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq_lane_untied: +** mov v0.16b, v1.16b +** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\]) +** ret +*/ +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c new file mode 100644 index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c @@ -0,0 +1,28 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "--save-temps" } */ + +#include <arm_neon.h> + +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */ +} + +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */ +} + +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */ +} + +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */ +} +
Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes: > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -7028,3 +7028,36 @@ > "xtn\t%0.<Vntype>, %1.<Vtype>" > [(set_attr "type" "neon_shift_imm_narrow_q")] > ) > + > +(define_insn "aarch64_bfdot<mode>" > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF > + (unspec:VDQSF > + [(match_operand:<VBFMLA_W> 2 "register_operand" "w") > + (match_operand:<VBFMLA_W> 3 "register_operand" "w")] > + UNSPEC_BFDOT) > + (match_operand:VDQSF 1 "register_operand" "0")))] > + "TARGET_BF16_SIMD" > + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" > + [(set_attr "type" "neon_dot<q>")] > +) > + > + > +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" Too many blank lines. > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF > + (unspec:VDQSF > + [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w") > + (match_operand:VBF 3 "register_operand" "w") > + (match_operand:SI 4 "const_int_operand" "n")] > + UNSPEC_BFDOT) > + (match_operand:VDQSF 1 "register_operand" "0")))] > + "TARGET_BF16_SIMD" > +{ > + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); > + int lane = INTVAL (operands[4]); > + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); > + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; > +} > + [(set_attr "type" "neon_dot<VDQSF:q>")] > +) > [...] > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > new file mode 100644 > index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > @@ -0,0 +1,91 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-additional-options "-O -save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ Same comment as for USDOT/SUDOT regarding the dg- markup. > + > +#include <arm_neon.h> > + > +/* > +**ufoo: > +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) > +** ret > +*/ > +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} > + > +/* > +**ufooq: > +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) > +** ret > +*/ > +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_f32 (r, x, y); > +} The (...|...)s here are correct. > + > +/* > +**ufoo_lane: > +** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\]) > +** ret > +*/ > +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_lane_f32 (r, x, y, 0); > +} > + > +/* > +**ufooq_laneq: > +** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\]) > +** ret > +*/ > +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_laneq_f32 (r, x, y, 2); > +} > + > +/* > +**ufoo_laneq: > +** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\]) > +** ret > +*/ > +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) > +{ > + return vbfdot_laneq_f32 (r, x, y, 3); > +} > + > +/* > +**ufooq_lane: > +** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\]) > +** ret > +*/ > +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 1); > +} But these aren't, since the operands must be in the order given. > + > +/* > +**ufoo_untied: > +** mov v0.8b, v1.8b > +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) > +** ret > +*/ > +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} Similarly, OK here. > + > +/* > +**ufooq_lane_untied: > +** mov v0.16b, v1.16b > +** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\]) > +** ret > +*/ > +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 1); > +} ...but not here. Same comments for the big-endian test. Thanks, Richard
On 12/30/19 10:29 AM, Richard Sandiford wrote: > Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes: >> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md >> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644 >> --- a/gcc/config/aarch64/aarch64-simd.md >> +++ b/gcc/config/aarch64/aarch64-simd.md >> @@ -7028,3 +7028,36 @@ >> "xtn\t%0.<Vntype>, %1.<Vtype>" >> [(set_attr "type" "neon_shift_imm_narrow_q")] >> ) >> + >> +(define_insn "aarch64_bfdot<mode>" >> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >> + (plus:VDQSF >> + (unspec:VDQSF >> + [(match_operand:<VBFMLA_W> 2 "register_operand" "w") >> + (match_operand:<VBFMLA_W> 3 "register_operand" "w")] >> + UNSPEC_BFDOT) >> + (match_operand:VDQSF 1 "register_operand" "0")))] >> + "TARGET_BF16_SIMD" >> + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" >> + [(set_attr "type" "neon_dot<q>")] >> +) >> + >> + >> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" > > Too many blank lines. Fixed, sorry I hadn't noticed! > >> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >> + (plus:VDQSF >> + (unspec:VDQSF >> + [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w") >> + (match_operand:VBF 3 "register_operand" "w") >> + (match_operand:SI 4 "const_int_operand" "n")] >> + UNSPEC_BFDOT) >> + (match_operand:VDQSF 1 "register_operand" "0")))] >> + "TARGET_BF16_SIMD" >> +{ >> + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); >> + int lane = INTVAL (operands[4]); >> + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); >> + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; >> +} >> + [(set_attr "type" "neon_dot<VDQSF:q>")] >> +) >> [...] >> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >> new file mode 100644 >> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >> @@ -0,0 +1,91 @@ >> +/* { dg-do assemble { target { aarch64*-*-* } } } */ >> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ >> +/* { dg-add-options arm_v8_2a_bf16_neon } */ >> +/* { dg-additional-options "-O -save-temps" } */ >> +/* { dg-final { check-function-bodies "**" "" } } */ >> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ > > Same comment as for USDOT/SUDOT regarding the dg- markup. Done! > >> + >> +#include <arm_neon.h> >> + >> +/* >> +**ufoo: >> +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) >> +** ret >> +*/ >> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_f32 (r, x, y); >> +} >> + >> +/* >> +**ufooq: >> +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) >> +** ret >> +*/ >> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >> +{ >> + return vbfdotq_f32 (r, x, y); >> +} > > The (...|...)s here are correct. Yep. > >> + >> +/* >> +**ufoo_lane: >> +** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\]) >> +** ret >> +*/ >> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_lane_f32 (r, x, y, 0); >> +} >> + >> +/* >> +**ufooq_laneq: >> +** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\]) >> +** ret >> +*/ >> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >> +{ >> + return vbfdotq_laneq_f32 (r, x, y, 2); >> +} >> + >> +/* >> +**ufoo_laneq: >> +** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\]) >> +** ret >> +*/ >> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) >> +{ >> + return vbfdot_laneq_f32 (r, x, y, 3); >> +} >> + >> +/* >> +**ufooq_lane: >> +** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\]) >> +** ret >> +*/ >> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >> +{ >> + return vbfdotq_lane_f32 (r, x, y, 1); >> +} > > But these aren't, since the operands must be in the order given. Yep. > >> + >> +/* >> +**ufoo_untied: >> +** mov v0.8b, v1.8b >> +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) >> +** ret >> +*/ >> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_f32 (r, x, y); >> +} > > Similarly, OK here. Yep. > >> + >> +/* >> +**ufooq_lane_untied: >> +** mov v0.16b, v1.16b >> +** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\]) >> +** ret >> +*/ >> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >> +{ >> + return vbfdotq_lane_f32 (r, x, y, 1); >> +} > > ...but not here. Yep. > > Same comments for the big-endian test. Done. Thank you so much for the in depth review comments! Cheers, Stam > > Thanks, > Richard >
Please update the names of the testsuite files to match the ones in the bfloat16_t patch. (Same for the usdot/sudot patch -- sorry for forgetting there.) OK with that change, thanks. Richard Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes: > On 12/30/19 10:29 AM, Richard Sandiford wrote: >> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes: >>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md >>> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644 >>> --- a/gcc/config/aarch64/aarch64-simd.md >>> +++ b/gcc/config/aarch64/aarch64-simd.md >>> @@ -7028,3 +7028,36 @@ >>> "xtn\t%0.<Vntype>, %1.<Vtype>" >>> [(set_attr "type" "neon_shift_imm_narrow_q")] >>> ) >>> + >>> +(define_insn "aarch64_bfdot<mode>" >>> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >>> + (plus:VDQSF >>> + (unspec:VDQSF >>> + [(match_operand:<VBFMLA_W> 2 "register_operand" "w") >>> + (match_operand:<VBFMLA_W> 3 "register_operand" "w")] >>> + UNSPEC_BFDOT) >>> + (match_operand:VDQSF 1 "register_operand" "0")))] >>> + "TARGET_BF16_SIMD" >>> + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" >>> + [(set_attr "type" "neon_dot<q>")] >>> +) >>> + >>> + >>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" >> >> Too many blank lines. > > Fixed, sorry I hadn't noticed! > >> >>> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >>> + (plus:VDQSF >>> + (unspec:VDQSF >>> + [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w") >>> + (match_operand:VBF 3 "register_operand" "w") >>> + (match_operand:SI 4 "const_int_operand" "n")] >>> + UNSPEC_BFDOT) >>> + (match_operand:VDQSF 1 "register_operand" "0")))] >>> + "TARGET_BF16_SIMD" >>> +{ >>> + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); >>> + int lane = INTVAL (operands[4]); >>> + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); >>> + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; >>> +} >>> + [(set_attr "type" "neon_dot<VDQSF:q>")] >>> +) >>> [...] >>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >>> new file mode 100644 >>> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >>> @@ -0,0 +1,91 @@ >>> +/* { dg-do assemble { target { aarch64*-*-* } } } */ >>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ >>> +/* { dg-add-options arm_v8_2a_bf16_neon } */ >>> +/* { dg-additional-options "-O -save-temps" } */ >>> +/* { dg-final { check-function-bodies "**" "" } } */ >>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ >> >> Same comment as for USDOT/SUDOT regarding the dg- markup. > > Done! >> >>> + >>> +#include <arm_neon.h> >>> + >>> +/* >>> +**ufoo: >>> +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) >>> +** ret >>> +*/ >>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >>> +{ >>> + return vbfdot_f32 (r, x, y); >>> +} >>> + >>> +/* >>> +**ufooq: >>> +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) >>> +** ret >>> +*/ >>> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >>> +{ >>> + return vbfdotq_f32 (r, x, y); >>> +} >> >> The (...|...)s here are correct. > Yep. >> >>> + >>> +/* >>> +**ufoo_lane: >>> +** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\]) >>> +** ret >>> +*/ >>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >>> +{ >>> + return vbfdot_lane_f32 (r, x, y, 0); >>> +} >>> + >>> +/* >>> +**ufooq_laneq: >>> +** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\]) >>> +** ret >>> +*/ >>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >>> +{ >>> + return vbfdotq_laneq_f32 (r, x, y, 2); >>> +} >>> + >>> +/* >>> +**ufoo_laneq: >>> +** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\]) >>> +** ret >>> +*/ >>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) >>> +{ >>> + return vbfdot_laneq_f32 (r, x, y, 3); >>> +} >>> + >>> +/* >>> +**ufooq_lane: >>> +** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\]) >>> +** ret >>> +*/ >>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >>> +{ >>> + return vbfdotq_lane_f32 (r, x, y, 1); >>> +} >> >> But these aren't, since the operands must be in the order given. > Yep. >> >>> + >>> +/* >>> +**ufoo_untied: >>> +** mov v0.8b, v1.8b >>> +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) >>> +** ret >>> +*/ >>> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >>> +{ >>> + return vbfdot_f32 (r, x, y); >>> +} >> >> Similarly, OK here. > Yep. >> >>> + >>> +/* >>> +**ufooq_lane_untied: >>> +** mov v0.16b, v1.16b >>> +** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\]) >>> +** ret >>> +*/ >>> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >>> +{ >>> + return vbfdotq_lane_f32 (r, x, y, 1); >>> +} >> >> ...but not here. > Yep. >> >> Same comments for the big-endian test. > Done. > > Thank you so much for the in depth review comments! > > Cheers, > Stam >> >> Thanks, >> Richard >> > > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def > index 57fc5933b43bfc0da132342c681b8a2c14549c9c..41ccda8a5d77b8ec3cfd984f3c5fc02369e7199f 100644 > --- a/gcc/config/aarch64/aarch64-simd-builtins.def > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def > @@ -682,3 +682,8 @@ > BUILTIN_VSFDF (UNOP, frint32x, 0) > BUILTIN_VSFDF (UNOP, frint64z, 0) > BUILTIN_VSFDF (UNOP, frint64x, 0) > + > + /* Implemented by aarch64_bfdot{_lane}{q}<mode>. */ > + VAR2 (TERNOP, bfdot, 0, v2sf, v4sf) > + VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf) > + VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf) > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index cea9592695ac8bd2f4e625f8b769ddaf716e9091..a95489dc17ac38be8e85457ad1804387f1772dc3 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -7025,3 +7025,35 @@ > "xtn\t%0.<Vntype>, %1.<Vtype>" > [(set_attr "type" "neon_shift_imm_narrow_q")] > ) > + > +(define_insn "aarch64_bfdot<mode>" > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF > + (unspec:VDQSF > + [(match_operand:<VBFMLA_W> 2 "register_operand" "w") > + (match_operand:<VBFMLA_W> 3 "register_operand" "w")] > + UNSPEC_BFDOT) > + (match_operand:VDQSF 1 "register_operand" "0")))] > + "TARGET_BF16_SIMD" > + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" > + [(set_attr "type" "neon_dot<q>")] > +) > + > +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF > + (unspec:VDQSF > + [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w") > + (match_operand:VBF 3 "register_operand" "w") > + (match_operand:SI 4 "const_int_operand" "n")] > + UNSPEC_BFDOT) > + (match_operand:VDQSF 1 "register_operand" "0")))] > + "TARGET_BF16_SIMD" > +{ > + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); > + int lane = INTVAL (operands[4]); > + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); > + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; > +} > + [(set_attr "type" "neon_dot<VDQSF:q>")] > +) > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index eaba156e26cf35b07b96972fe2741a9c00d6caa9..1a8b27956d4ca25e0ed6f3c38030b3eba0546c4f 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a) > > #include "arm_bf16.h" > > +#pragma GCC push_options > +#pragma GCC target ("arch=armv8.2-a+bf16") > + > +__extension__ extern __inline float32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) > +{ > + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); > +} > + > +__extension__ extern __inline float32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) > +{ > + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); > +} > + > +__extension__ extern __inline float32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, > + const int __index) > +{ > + return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline float32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, > + const int __index) > +{ > + return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline float32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, > + const int __index) > +{ > + return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline float32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, > + const int __index) > +{ > + return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); > +} > + > +#pragma GCC pop_options > + > #undef __aarch64_vget_lane_any > > #undef __aarch64_vdup_lane_any > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index 2d566ca1a5fad18b701f1954cff967342085874a..091d3a2fb6926f614d354052961d0913d41f71e9 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -122,6 +122,9 @@ > ;; Quad vector with only 2 element modes. > (define_mode_iterator VQ_2E [V2DI V2DF]) > > +;; BFmode vector modes. > +(define_mode_iterator VBF [V4BF V8BF]) > + > ;; This mode iterator allows :P to be used for patterns that operate on > ;; addresses in different modes. In LP64, only DI will match, while in > ;; ILP32, either can match. > @@ -671,6 +674,7 @@ > UNSPEC_UMULHS ; Used in aarch64-sve2.md. > UNSPEC_UMULHRS ; Used in aarch64-sve2.md. > UNSPEC_ASRD ; Used in aarch64-sve.md. > + UNSPEC_BFDOT ; Used in aarch64-simd.md. > ]) > > ;; ------------------------------------------------------------------ > @@ -727,6 +731,8 @@ > > (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")]) > > +(define_mode_attr isquadop [(V4BF "") (V8BF "q")]) > + > ;; For scalar usage of vector/FP registers > (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d") > (HF "h") (SF "s") (DF "d") > @@ -1310,6 +1316,9 @@ > ;; Register suffix for DOTPROD input types from the return type. > (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")]) > > +;; Register suffix for BFDOT input types from the return type. > +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")]) > + > ;; Sum of lengths of instructions needed to move vector registers of a mode. > (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")]) > > @@ -1320,6 +1329,9 @@ > ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub > (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")]) > > +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub > +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")]) > + > (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")]) > > (define_mode_attr f16quad [(V2SF "") (V4SF "q")]) > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > new file mode 100644 > index 0000000000000000000000000000000000000000..ad51507731bbb165de64e583ebfbf8047b4eb781 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > @@ -0,0 +1,91 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-additional-options "-save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ > + > +#include <arm_neon.h> > + > +/* > +**ufoo: > +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) > +** ret > +*/ > +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} > + > +/* > +**ufooq: > +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) > +** ret > +*/ > +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_f32 (r, x, y); > +} > + > +/* > +**ufoo_lane: > +** bfdot v0.2s, v1.4h, v2.2h\[0\] > +** ret > +*/ > +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_lane_f32 (r, x, y, 0); > +} > + > +/* > +**ufooq_laneq: > +** bfdot v0.4s, v1.8h, v2.2h\[2\] > +** ret > +*/ > +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_laneq_f32 (r, x, y, 2); > +} > + > +/* > +**ufoo_laneq: > +** bfdot v0.2s, v1.4h, v2.2h\[3\] > +** ret > +*/ > +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) > +{ > + return vbfdot_laneq_f32 (r, x, y, 3); > +} > + > +/* > +**ufooq_lane: > +** bfdot v0.4s, v1.8h, v2.2h\[1\] > +** ret > +*/ > +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 1); > +} > + > +/* > +**ufoo_untied: > +** mov v0.8b, v1.8b > +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) > +** ret > +*/ > +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} > + > +/* > +**ufooq_lane_untied: > +** mov v0.16b, v1.16b > +** bfdot v0.4s, v2.8h, v3.2h\[1\] > +** ret > +*/ > +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 1); > +} > + > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c > new file mode 100644 > index 0000000000000000000000000000000000000000..58bdee5ac9df602b7569724200b3c9ab7c72bb28 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c > @@ -0,0 +1,91 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-additional-options "-mbig-endian --save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ > + > +#include <arm_neon.h> > + > +/* > +**ufoo: > +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) > +** ret > +*/ > +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} > + > +/* > +**ufooq: > +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) > +** ret > +*/ > +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_f32 (r, x, y); > +} > + > +/* > +**ufoo_lane: > +** bfdot v0.2s, v1.4h, v2.2h\[0\] > +** ret > +*/ > +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_lane_f32 (r, x, y, 0); > +} > + > +/* > +**ufooq_laneq: > +** bfdot v0.4s, v1.8h, v2.2h\[2\] > +** ret > +*/ > +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_laneq_f32 (r, x, y, 2); > +} > + > +/* > +**ufoo_laneq: > +** bfdot v0.2s, v1.4h, v2.2h\[3\] > +** ret > +*/ > +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) > +{ > + return vbfdot_laneq_f32 (r, x, y, 3); > +} > + > +/* > +**ufooq_lane: > +** bfdot v0.4s, v1.8h, v2.2h\[1\] > +** ret > +*/ > +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 1); > +} > + > +/* > +**ufoo_untied: > +** mov v0.8b, v1.8b > +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) > +** ret > +*/ > +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} > + > +/* > +**ufooq_lane_untied: > +** mov v0.16b, v1.16b > +** bfdot v0.4s, v2.8h, v3.2h\[1\] > +** ret > +*/ > +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 1); > +} > + > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c > new file mode 100644 > index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c > @@ -0,0 +1,28 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-additional-options "--save-temps" } */ > + > +#include <arm_neon.h> > + > +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */ > +} > + > +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */ > +} > + > +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) > +{ > + return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */ > +} > + > +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */ > +} > +
On 1/9/20 3:54 PM, Richard Sandiford wrote: > Please update the names of the testsuite files to match the ones > in the bfloat16_t patch. (Same for the usdot/sudot patch -- sorry > for forgetting there.) > > OK with that change, thanks. > Done and committed as r10-6006-gf275d73a57f1e5a07fbd4978f4b4457a5eaa1e39 Thank you! Stam > Richard > > Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes: >> On 12/30/19 10:29 AM, Richard Sandiford wrote: >>> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes: >>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md >>>> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644 >>>> --- a/gcc/config/aarch64/aarch64-simd.md >>>> +++ b/gcc/config/aarch64/aarch64-simd.md >>>> @@ -7028,3 +7028,36 @@ >>>> "xtn\t%0.<Vntype>, %1.<Vtype>" >>>> [(set_attr "type" "neon_shift_imm_narrow_q")] >>>> ) >>>> + >>>> +(define_insn "aarch64_bfdot<mode>" >>>> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >>>> + (plus:VDQSF >>>> + (unspec:VDQSF >>>> + [(match_operand:<VBFMLA_W> 2 "register_operand" "w") >>>> + (match_operand:<VBFMLA_W> 3 "register_operand" "w")] >>>> + UNSPEC_BFDOT) >>>> + (match_operand:VDQSF 1 "register_operand" "0")))] >>>> + "TARGET_BF16_SIMD" >>>> + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" >>>> + [(set_attr "type" "neon_dot<q>")] >>>> +) >>>> + >>>> + >>>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" >>> >>> Too many blank lines. >> >> Fixed, sorry I hadn't noticed! >> >>> >>>> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >>>> + (plus:VDQSF >>>> + (unspec:VDQSF >>>> + [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w") >>>> + (match_operand:VBF 3 "register_operand" "w") >>>> + (match_operand:SI 4 "const_int_operand" "n")] >>>> + UNSPEC_BFDOT) >>>> + (match_operand:VDQSF 1 "register_operand" "0")))] >>>> + "TARGET_BF16_SIMD" >>>> +{ >>>> + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); >>>> + int lane = INTVAL (operands[4]); >>>> + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); >>>> + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; >>>> +} >>>> + [(set_attr "type" "neon_dot<VDQSF:q>")] >>>> +) >>>> [...] >>>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >>>> new file mode 100644 >>>> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72 >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >>>> @@ -0,0 +1,91 @@ >>>> +/* { dg-do assemble { target { aarch64*-*-* } } } */ >>>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ >>>> +/* { dg-add-options arm_v8_2a_bf16_neon } */ >>>> +/* { dg-additional-options "-O -save-temps" } */ >>>> +/* { dg-final { check-function-bodies "**" "" } } */ >>>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ >>> >>> Same comment as for USDOT/SUDOT regarding the dg- markup. >> >> Done! >>> >>>> + >>>> +#include <arm_neon.h> >>>> + >>>> +/* >>>> +**ufoo: >>>> +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) >>>> +** ret >>>> +*/ >>>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >>>> +{ >>>> + return vbfdot_f32 (r, x, y); >>>> +} >>>> + >>>> +/* >>>> +**ufooq: >>>> +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) >>>> +** ret >>>> +*/ >>>> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >>>> +{ >>>> + return vbfdotq_f32 (r, x, y); >>>> +} >>> >>> The (...|...)s here are correct. >> Yep. >>> >>>> + >>>> +/* >>>> +**ufoo_lane: >>>> +** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\]) >>>> +** ret >>>> +*/ >>>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >>>> +{ >>>> + return vbfdot_lane_f32 (r, x, y, 0); >>>> +} >>>> + >>>> +/* >>>> +**ufooq_laneq: >>>> +** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\]) >>>> +** ret >>>> +*/ >>>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >>>> +{ >>>> + return vbfdotq_laneq_f32 (r, x, y, 2); >>>> +} >>>> + >>>> +/* >>>> +**ufoo_laneq: >>>> +** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\]) >>>> +** ret >>>> +*/ >>>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) >>>> +{ >>>> + return vbfdot_laneq_f32 (r, x, y, 3); >>>> +} >>>> + >>>> +/* >>>> +**ufooq_lane: >>>> +** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\]) >>>> +** ret >>>> +*/ >>>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >>>> +{ >>>> + return vbfdotq_lane_f32 (r, x, y, 1); >>>> +} >>> >>> But these aren't, since the operands must be in the order given. >> Yep. >>> >>>> + >>>> +/* >>>> +**ufoo_untied: >>>> +** mov v0.8b, v1.8b >>>> +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) >>>> +** ret >>>> +*/ >>>> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >>>> +{ >>>> + return vbfdot_f32 (r, x, y); >>>> +} >>> >>> Similarly, OK here. >> Yep. >>> >>>> + >>>> +/* >>>> +**ufooq_lane_untied: >>>> +** mov v0.16b, v1.16b >>>> +** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\]) >>>> +** ret >>>> +*/ >>>> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >>>> +{ >>>> + return vbfdotq_lane_f32 (r, x, y, 1); >>>> +} >>> >>> ...but not here. >> Yep. >>> >>> Same comments for the big-endian test. >> Done. >> >> Thank you so much for the in depth review comments! >> >> Cheers, >> Stam >>> >>> Thanks, >>> Richard >>> >> >> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def >> index 57fc5933b43bfc0da132342c681b8a2c14549c9c..41ccda8a5d77b8ec3cfd984f3c5fc02369e7199f 100644 >> --- a/gcc/config/aarch64/aarch64-simd-builtins.def >> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def >> @@ -682,3 +682,8 @@ >> BUILTIN_VSFDF (UNOP, frint32x, 0) >> BUILTIN_VSFDF (UNOP, frint64z, 0) >> BUILTIN_VSFDF (UNOP, frint64x, 0) >> + >> + /* Implemented by aarch64_bfdot{_lane}{q}<mode>. */ >> + VAR2 (TERNOP, bfdot, 0, v2sf, v4sf) >> + VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf) >> + VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf) >> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md >> index cea9592695ac8bd2f4e625f8b769ddaf716e9091..a95489dc17ac38be8e85457ad1804387f1772dc3 100644 >> --- a/gcc/config/aarch64/aarch64-simd.md >> +++ b/gcc/config/aarch64/aarch64-simd.md >> @@ -7025,3 +7025,35 @@ >> "xtn\t%0.<Vntype>, %1.<Vtype>" >> [(set_attr "type" "neon_shift_imm_narrow_q")] >> ) >> + >> +(define_insn "aarch64_bfdot<mode>" >> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >> + (plus:VDQSF >> + (unspec:VDQSF >> + [(match_operand:<VBFMLA_W> 2 "register_operand" "w") >> + (match_operand:<VBFMLA_W> 3 "register_operand" "w")] >> + UNSPEC_BFDOT) >> + (match_operand:VDQSF 1 "register_operand" "0")))] >> + "TARGET_BF16_SIMD" >> + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" >> + [(set_attr "type" "neon_dot<q>")] >> +) >> + >> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" >> + [(set (match_operand:VDQSF 0 "register_operand" "=w") >> + (plus:VDQSF >> + (unspec:VDQSF >> + [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w") >> + (match_operand:VBF 3 "register_operand" "w") >> + (match_operand:SI 4 "const_int_operand" "n")] >> + UNSPEC_BFDOT) >> + (match_operand:VDQSF 1 "register_operand" "0")))] >> + "TARGET_BF16_SIMD" >> +{ >> + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); >> + int lane = INTVAL (operands[4]); >> + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); >> + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; >> +} >> + [(set_attr "type" "neon_dot<VDQSF:q>")] >> +) >> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h >> index eaba156e26cf35b07b96972fe2741a9c00d6caa9..1a8b27956d4ca25e0ed6f3c38030b3eba0546c4f 100644 >> --- a/gcc/config/aarch64/arm_neon.h >> +++ b/gcc/config/aarch64/arm_neon.h >> @@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a) >> >> #include "arm_bf16.h" >> >> +#pragma GCC push_options >> +#pragma GCC target ("arch=armv8.2-a+bf16") >> + >> +__extension__ extern __inline float32x2_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) >> +{ >> + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); >> +} >> + >> +__extension__ extern __inline float32x4_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) >> +{ >> + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); >> +} >> + >> +__extension__ extern __inline float32x2_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, >> + const int __index) >> +{ >> + return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); >> +} >> + >> +__extension__ extern __inline float32x4_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, >> + const int __index) >> +{ >> + return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); >> +} >> + >> +__extension__ extern __inline float32x2_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, >> + const int __index) >> +{ >> + return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); >> +} >> + >> +__extension__ extern __inline float32x4_t >> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, >> + const int __index) >> +{ >> + return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); >> +} >> + >> +#pragma GCC pop_options >> + >> #undef __aarch64_vget_lane_any >> >> #undef __aarch64_vdup_lane_any >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md >> index 2d566ca1a5fad18b701f1954cff967342085874a..091d3a2fb6926f614d354052961d0913d41f71e9 100644 >> --- a/gcc/config/aarch64/iterators.md >> +++ b/gcc/config/aarch64/iterators.md >> @@ -122,6 +122,9 @@ >> ;; Quad vector with only 2 element modes. >> (define_mode_iterator VQ_2E [V2DI V2DF]) >> >> +;; BFmode vector modes. >> +(define_mode_iterator VBF [V4BF V8BF]) >> + >> ;; This mode iterator allows :P to be used for patterns that operate on >> ;; addresses in different modes. In LP64, only DI will match, while in >> ;; ILP32, either can match. >> @@ -671,6 +674,7 @@ >> UNSPEC_UMULHS ; Used in aarch64-sve2.md. >> UNSPEC_UMULHRS ; Used in aarch64-sve2.md. >> UNSPEC_ASRD ; Used in aarch64-sve.md. >> + UNSPEC_BFDOT ; Used in aarch64-simd.md. >> ]) >> >> ;; ------------------------------------------------------------------ >> @@ -727,6 +731,8 @@ >> >> (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")]) >> >> +(define_mode_attr isquadop [(V4BF "") (V8BF "q")]) >> + >> ;; For scalar usage of vector/FP registers >> (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d") >> (HF "h") (SF "s") (DF "d") >> @@ -1310,6 +1316,9 @@ >> ;; Register suffix for DOTPROD input types from the return type. >> (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")]) >> >> +;; Register suffix for BFDOT input types from the return type. >> +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")]) >> + >> ;; Sum of lengths of instructions needed to move vector registers of a mode. >> (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")]) >> >> @@ -1320,6 +1329,9 @@ >> ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub >> (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")]) >> >> +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub >> +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")]) >> + >> (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")]) >> >> (define_mode_attr f16quad [(V2SF "") (V4SF "q")]) >> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >> new file mode 100644 >> index 0000000000000000000000000000000000000000..ad51507731bbb165de64e583ebfbf8047b4eb781 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c >> @@ -0,0 +1,91 @@ >> +/* { dg-do assemble { target { aarch64*-*-* } } } */ >> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ >> +/* { dg-add-options arm_v8_2a_bf16_neon } */ >> +/* { dg-additional-options "-save-temps" } */ >> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ >> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ >> + >> +#include <arm_neon.h> >> + >> +/* >> +**ufoo: >> +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) >> +** ret >> +*/ >> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_f32 (r, x, y); >> +} >> + >> +/* >> +**ufooq: >> +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) >> +** ret >> +*/ >> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >> +{ >> + return vbfdotq_f32 (r, x, y); >> +} >> + >> +/* >> +**ufoo_lane: >> +** bfdot v0.2s, v1.4h, v2.2h\[0\] >> +** ret >> +*/ >> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_lane_f32 (r, x, y, 0); >> +} >> + >> +/* >> +**ufooq_laneq: >> +** bfdot v0.4s, v1.8h, v2.2h\[2\] >> +** ret >> +*/ >> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >> +{ >> + return vbfdotq_laneq_f32 (r, x, y, 2); >> +} >> + >> +/* >> +**ufoo_laneq: >> +** bfdot v0.2s, v1.4h, v2.2h\[3\] >> +** ret >> +*/ >> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) >> +{ >> + return vbfdot_laneq_f32 (r, x, y, 3); >> +} >> + >> +/* >> +**ufooq_lane: >> +** bfdot v0.4s, v1.8h, v2.2h\[1\] >> +** ret >> +*/ >> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >> +{ >> + return vbfdotq_lane_f32 (r, x, y, 1); >> +} >> + >> +/* >> +**ufoo_untied: >> +** mov v0.8b, v1.8b >> +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) >> +** ret >> +*/ >> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_f32 (r, x, y); >> +} >> + >> +/* >> +**ufooq_lane_untied: >> +** mov v0.16b, v1.16b >> +** bfdot v0.4s, v2.8h, v3.2h\[1\] >> +** ret >> +*/ >> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >> +{ >> + return vbfdotq_lane_f32 (r, x, y, 1); >> +} >> + >> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c >> new file mode 100644 >> index 0000000000000000000000000000000000000000..58bdee5ac9df602b7569724200b3c9ab7c72bb28 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c >> @@ -0,0 +1,91 @@ >> +/* { dg-do assemble { target { aarch64*-*-* } } } */ >> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ >> +/* { dg-add-options arm_v8_2a_bf16_neon } */ >> +/* { dg-additional-options "-mbig-endian --save-temps" } */ >> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ >> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ >> + >> +#include <arm_neon.h> >> + >> +/* >> +**ufoo: >> +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) >> +** ret >> +*/ >> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_f32 (r, x, y); >> +} >> + >> +/* >> +**ufooq: >> +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) >> +** ret >> +*/ >> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >> +{ >> + return vbfdotq_f32 (r, x, y); >> +} >> + >> +/* >> +**ufoo_lane: >> +** bfdot v0.2s, v1.4h, v2.2h\[0\] >> +** ret >> +*/ >> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_lane_f32 (r, x, y, 0); >> +} >> + >> +/* >> +**ufooq_laneq: >> +** bfdot v0.4s, v1.8h, v2.2h\[2\] >> +** ret >> +*/ >> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >> +{ >> + return vbfdotq_laneq_f32 (r, x, y, 2); >> +} >> + >> +/* >> +**ufoo_laneq: >> +** bfdot v0.2s, v1.4h, v2.2h\[3\] >> +** ret >> +*/ >> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) >> +{ >> + return vbfdot_laneq_f32 (r, x, y, 3); >> +} >> + >> +/* >> +**ufooq_lane: >> +** bfdot v0.4s, v1.8h, v2.2h\[1\] >> +** ret >> +*/ >> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >> +{ >> + return vbfdotq_lane_f32 (r, x, y, 1); >> +} >> + >> +/* >> +**ufoo_untied: >> +** mov v0.8b, v1.8b >> +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) >> +** ret >> +*/ >> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_f32 (r, x, y); >> +} >> + >> +/* >> +**ufooq_lane_untied: >> +** mov v0.16b, v1.16b >> +** bfdot v0.4s, v2.8h, v3.2h\[1\] >> +** ret >> +*/ >> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >> +{ >> + return vbfdotq_lane_f32 (r, x, y, 1); >> +} >> + >> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c >> new file mode 100644 >> index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c >> @@ -0,0 +1,28 @@ >> +/* { dg-do assemble { target { aarch64*-*-* } } } */ >> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ >> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ >> +/* { dg-add-options arm_v8_2a_bf16_neon } */ >> +/* { dg-additional-options "--save-temps" } */ >> + >> +#include <arm_neon.h> >> + >> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) >> +{ >> + return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */ >> +} >> + >> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) >> +{ >> + return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */ >> +} >> + >> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) >> +{ >> + return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */ >> +} >> + >> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) >> +{ >> + return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */ >> +} >> +
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..6c5b61c37bcb340f963861723c6e365e32f6ca95 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -682,3 +682,8 @@ BUILTIN_VSFDF (UNOP, frint32x, 0) BUILTIN_VSFDF (UNOP, frint64z, 0) BUILTIN_VSFDF (UNOP, frint64x, 0) + + /* Implemented by aarch64_bfdot{_lane}{q}<mode>. */ + VAR2 (TERNOP, bfdot, 0, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -7027,3 +7027,37 @@ "xtn\t%0.<Vntype>, %1.<Vtype>" [(set_attr "type" "neon_shift_imm_narrow_q")] ) + +(define_insn "aarch64_bfdot<mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0") + (unspec:VDQSF [(match_operand:<VBFMLA_W> 2 + "register_operand" "w") + (match_operand:<VBFMLA_W> 3 + "register_operand" "w")] + UNSPEC_BFDOT)))] + "TARGET_BF16_SIMD" + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" + [(set_attr "type" "neon_dot<q>")] +) + + +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0") + (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2 + "register_operand" "w") + (match_operand: VBF 3 + "register_operand" "w") + (match_operand:SI 4 + "const_int_operand" "n")] + UNSPEC_BFDOT)))] + "TARGET_BF16_SIMD" +{ + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); + int lane = INTVAL (operands[4]); + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; +} + [(set_attr "type" "neon_dot<VDQSF:q>")] +) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -34612,6 +34612,57 @@ vrnd64xq_f64 (float64x2_t __a) #include "arm_bf16.h" +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) +{ + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_lane_f32 \ + (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int __index) +{ + return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_lane_f32 \ + (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, const int __index) +{ + return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_laneq_f32 \ + (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, const int __index) +{ + return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_laneq_f32 \ + (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, const int __index) +{ + return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); +} + +#pragma GCC pop_options + #undef __aarch64_vget_lane_any #undef __aarch64_vdup_lane_any diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 9480efef47c1865867148bd43dbf10faf227f5d0..4645e481b864d505c0d2de2d0bae776982f2c823 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -120,6 +120,9 @@ ;; Quad vector with only 2 element modes. (define_mode_iterator VQ_2E [V2DI V2DF]) +;; BFmode vector modes. +(define_mode_iterator VBF [V4BF V8BF]) + ;; This mode iterator allows :P to be used for patterns that operate on ;; addresses in different modes. In LP64, only DI will match, while in ;; ILP32, either can match. @@ -673,6 +676,7 @@ UNSPEC_UMULHS ; Used in aarch64-sve2.md. UNSPEC_UMULHRS ; Used in aarch64-sve2.md. UNSPEC_ASRD ; Used in aarch64-sve.md. + UNSPEC_BFDOT ; Used in aarch64-simd.md. ]) ;; ------------------------------------------------------------------ @@ -729,6 +733,8 @@ (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")]) +(define_mode_attr isquadop [(V4BF "") (V8BF "q")]) + ;; For scalar usage of vector/FP registers (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d") (HF "h") (SF "s") (DF "d") @@ -1310,6 +1316,9 @@ ;; Register suffix for DOTPROD input types from the return type. (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")]) +;; Register suffix for BFDOT input types from the return type. +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")]) + ;; Sum of lengths of instructions needed to move vector registers of a mode. (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")]) @@ -1320,6 +1329,9 @@ ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")]) +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")]) + (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")]) (define_mode_attr f16quad [(V2SF "") (V4SF "q")]) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c new file mode 100644 index 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c @@ -0,0 +1,80 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "--save-temps" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include <arm_neon.h> + +/* +**ufoo: +** ... +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h +** ... +** ret +*/ +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq: +** ... +** bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h +** ... +** ret +*/ +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_f32 (r, x, y); +} + +/* +**ufoo_lane: +** ... +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.2h\[0\] +** ... +** ret +*/ +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, 0); +} + +/* +**ufooq_laneq: +** ... +** bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.2h\[2\] +** ... +** ret +*/ +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, 2); +} + +/* +**ufoo_laneq: +** ... +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.2h\[3\] +** ... +** ret +*/ +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 3); +} + +/* +**ufooq_lane: +** ... +** bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.2h\[1\] +** ... +** ret +*/ +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c new file mode 100644 index 0000000000000000000000000000000000000000..ae910bbdc0759e7bdd40566ef211f4f83b411792 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c @@ -0,0 +1,80 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-mbig-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include <arm_neon.h> + +/* +**ufoo: +** ... +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h +** ... +** ret +*/ +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq: +** ... +** bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h +** ... +** ret +*/ +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_f32 (r, x, y); +} + +/* +**ufoo_lane: +** ... +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.2h\[0\] +** ... +** ret +*/ +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, 0); +} + +/* +**ufooq_laneq: +** ... +** bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.2h\[2\] +** ... +** ret +*/ +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, 2); +} + +/* +**ufoo_laneq: +** ... +** bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.2h\[3\] +** ... +** ret +*/ +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 3); +} + +/* +**ufooq_lane: +** ... +** bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.2h\[1\] +** ... +** ret +*/ +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c new file mode 100644 index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c @@ -0,0 +1,28 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "--save-temps" } */ + +#include <arm_neon.h> + +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */ +} + +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */ +} + +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */ +} + +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */ +} +