diff mbox series

[AArch64] Add ACLE intrinsics for bfdot for ARMv8.6 Extension

Message ID 44ca371c-24bb-3a47-7dea-58aeb1595697@arm.com
State New
Headers show
Series [AArch64] Add ACLE intrinsics for bfdot for ARMv8.6 Extension | expand

Commit Message

Stamatis Markianos-Wright Dec. 20, 2019, 1:56 p.m. UTC
Hi all,

This patch adds the ARMv8.6 Extension ACLE intrinsics for the bfloat bfdot 
operation.

The functions are declared in arm_neon.h with the armv8.2-a+bf16 target option 
as required.

RTL patterns are defined to generate assembler.

Tests added to verify expected assembly and perform adequate lane checks.

This patch depends on:

https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html

for testuite effective_target update and on:

https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html

for back-end Bfloat enablement.

Cheers,
Stam


gcc/ChangeLog:

2019-11-04  Stam Markianos-Wright  <stam.markianos-wright@arm.com>

	* config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot,
           aarch64_bfdot_lane, aarch64_bfdot_laneq): New.
	* config/aarch64/aarch64-simd.md
           (aarch64_bfdot, aarch64_bfdot_lane): New.
	* config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32,
           vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New.
  	* config/aarch64/iterators.md (UNSPEC_BFDOT, VBF, isquadop, Vbfdottype,
           VBFMLA_W): New.

gcc/testsuite/ChangeLog:

2019-11-04  Stam Markianos-Wright  <stam.markianos-wright@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c: New.
	* gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c: New.
	* gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c: New.

Comments

Richard Sandiford Dec. 20, 2019, 2:36 p.m. UTC | #1
Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> Hi all,
>
> This patch adds the ARMv8.6 Extension ACLE intrinsics for the bfloat bfdot 
> operation.
>
> The functions are declared in arm_neon.h with the armv8.2-a+bf16 target option 
> as required.
>
> RTL patterns are defined to generate assembler.
>
> Tests added to verify expected assembly and perform adequate lane checks.
>
> This patch depends on:
>
> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html
>
> for testuite effective_target update and on:
>
> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html
> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html
>
> for back-end Bfloat enablement.
>
> Cheers,
> Stam
>
>
> gcc/ChangeLog:
>
> 2019-11-04  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
>
> 	* config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot,
>            aarch64_bfdot_lane, aarch64_bfdot_laneq): New.
> 	* config/aarch64/aarch64-simd.md
>            (aarch64_bfdot, aarch64_bfdot_lane): New.
> 	* config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32,
>            vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New.
>   	* config/aarch64/iterators.md (UNSPEC_BFDOT, VBF, isquadop, Vbfdottype,
>            VBFMLA_W): New.

Changelog nit: the continuation lines should be indened by a tab only.

> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7027,3 +7027,37 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +(define_insn "aarch64_bfdot<mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +	(plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
> +		    (unspec:VDQSF [(match_operand:<VBFMLA_W> 2
> +						"register_operand" "w")
> +				   (match_operand:<VBFMLA_W> 3
> +						"register_operand" "w")]
> +				   UNSPEC_BFDOT)))]

The operands to the plus should be the other way around, so that
the more complicated operand comes first,

> +  "TARGET_BF16_SIMD"
> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
> +  [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +
> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +	(plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
> +		    (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2
> +						"register_operand" "w")
> +				   (match_operand: VBF 3

Nit: should be no space before "VBF".

> +						"register_operand" "w")
> +				   (match_operand:SI 4
> +						"const_int_operand" "n")]
> +				   UNSPEC_BFDOT)))]
> +  "TARGET_BF16_SIMD"
> +{
> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
> +  int lane = INTVAL (operands[4]);
> +  operands[4] =  gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);

Should only be one space after "=".

> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
> +}
> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
> +)
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -34612,6 +34612,57 @@ vrnd64xq_f64 (float64x2_t __a)
>  
>  #include "arm_bf16.h"
>  
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
> +{
> +  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
> +{
> +  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_lane_f32 \
> +      (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int __index)

Stray backslash (same comment as for the USDOT/SUDOT review
just posted).

> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> @@ -0,0 +1,80 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "--save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**ufoo:
> +**	...
> +**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h
> +**	...
> +**	ret
> +*/
> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}

Same comments as for SUDOT and USDOT here too.

Thanks,
Richard
Stamatis Markianos-Wright Dec. 30, 2019, 9:22 a.m. UTC | #2
On 12/20/19 2:36 PM, Richard Sandiford wrote:
> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>> Hi all,
>>
>> This patch adds the ARMv8.6 Extension ACLE intrinsics for the bfloat bfdot
>> operation.
>>
>> The functions are declared in arm_neon.h with the armv8.2-a+bf16 target option
>> as required.
>>
>> RTL patterns are defined to generate assembler.
>>
>> Tests added to verify expected assembly and perform adequate lane checks.
>>
>> This patch depends on:
>>
>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html
>>
>> for testuite effective_target update and on:
>>
>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html
>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html
>>
>> for back-end Bfloat enablement.
>>
>> Cheers,
>> Stam
>>
>>
>> gcc/ChangeLog:
>>
>> 2019-11-04  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
>>
>> 	* config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot,
>>             aarch64_bfdot_lane, aarch64_bfdot_laneq): New.
>> 	* config/aarch64/aarch64-simd.md
>>             (aarch64_bfdot, aarch64_bfdot_lane): New.
>> 	* config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32, vbfdot_lane_f32,
>>             vbfdotq_lane_f32, vbfdot_laneq_f32, vbfdotq_laneq_f32): New.
>>    	* config/aarch64/iterators.md (UNSPEC_BFDOT, VBF, isquadop, Vbfdottype,
>>             VBFMLA_W): New.
> 
> Changelog nit: the continuation lines should be indened by a tab only.

Yes, sorry, that's my email client messing things up again! Fixed 
locally and will carry over when I do the commit.

> 
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -7027,3 +7027,37 @@
>>     "xtn\t%0.<Vntype>, %1.<Vtype>"
>>     [(set_attr "type" "neon_shift_imm_narrow_q")]
>>   )
>> +
>> +(define_insn "aarch64_bfdot<mode>"
>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>> +	(plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
>> +		    (unspec:VDQSF [(match_operand:<VBFMLA_W> 2
>> +						"register_operand" "w")
>> +				   (match_operand:<VBFMLA_W> 3
>> +						"register_operand" "w")]
>> +				   UNSPEC_BFDOT)))]
> 
> The operands to the plus should be the other way around, so that
> the more complicated operand comes first,
> 

Done

>> +  "TARGET_BF16_SIMD"
>> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
>> +  [(set_attr "type" "neon_dot<q>")]
>> +)
>> +
>> +
>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>> +	(plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
>> +		    (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2
>> +						"register_operand" "w")
>> +				   (match_operand: VBF 3
> 
> Nit: should be no space before "VBF".

Done

> 
>> +						"register_operand" "w")
>> +				   (match_operand:SI 4
>> +						"const_int_operand" "n")]
>> +				   UNSPEC_BFDOT)))]
>> +  "TARGET_BF16_SIMD"
>> +{
>> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
>> +  int lane = INTVAL (operands[4]);
>> +  operands[4] =  gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
> 
> Should only be one space after "=".

Done

> 
>> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
>> +}
>> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
>> +)
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>> @@ -34612,6 +34612,57 @@ vrnd64xq_f64 (float64x2_t __a)
>>   
>>   #include "arm_bf16.h"
>>   
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>> +
>> +__extension__ extern __inline float32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
>> +{
>> +  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>> +{
>> +  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdot_lane_f32 \
>> +      (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int __index)
> 
> Stray backslash (same comment as for the USDOT/SUDOT review
> just posted).

Done


> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>> @@ -0,0 +1,80 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>> +/* { dg-additional-options "--save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**ufoo:
>> +**	...
>> +**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h
>> +**	...
>> +**	ret
>> +*/
>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_f32 (r, x, y);
>> +}
> 
> Same comments as for SUDOT and USDOT here too.

Same changes as US/SUDOT.

Thank you!
Stam
> 
> Thanks,
> Richard
>
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..6c5b61c37bcb340f963861723c6e365e32f6ca95 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,8 @@
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
+  VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7028,3 +7028,36 @@
   "xtn\t%0.<Vntype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+(define_insn "aarch64_bfdot<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(plus:VDQSF
+	  (unspec:VDQSF
+	   [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
+	    (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
+	    UNSPEC_BFDOT)
+	  (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_BF16_SIMD"
+  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+
+(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(plus:VDQSF
+	  (unspec:VDQSF
+	   [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
+	    (match_operand:VBF 3 "register_operand" "w")
+	    (match_operand:SI 4 "const_int_operand" "n")]
+	    UNSPEC_BFDOT)
+	  (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_BF16_SIMD"
+{
+  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
+  int lane = INTVAL (operands[4]);
+  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
+  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
+}
+  [(set_attr "type" "neon_dot<VDQSF:q>")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ee4bb76bcd4f52bdf99ba9b24fc5749ba555a73b..c304c2c4597550882377d1dfce03fff92e8ebde3 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #include "arm_bf16.h"
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
+		 const int __index)
+{
+  return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		  const int __index)
+{
+  return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
+		  const int __index)
+{
+  return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		   const int __index)
+{
+  return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 04262645a019087b600ff47667c13381dab10d66..2277abcaf7f10a256ddbadb1d4be40ba42f0ac67 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -119,6 +119,9 @@
 ;; Quad vector with only 2 element modes.
 (define_mode_iterator VQ_2E [V2DI V2DF])
 
+;; BFmode vector modes.
+(define_mode_iterator VBF [V4BF V8BF])
+
 ;; This mode iterator allows :P to be used for patterns that operate on
 ;; addresses in different modes.  In LP64, only DI will match, while in
 ;; ILP32, either can match.
@@ -671,6 +674,7 @@
     UNSPEC_UMULHS	; Used in aarch64-sve2.md.
     UNSPEC_UMULHRS	; Used in aarch64-sve2.md.
     UNSPEC_ASRD		; Used in aarch64-sve.md.
+    UNSPEC_BFDOT	; Used in aarch64-simd.md.
 ])
 
 ;; ------------------------------------------------------------------
@@ -727,6 +731,8 @@
 
 (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")])
 
+(define_mode_attr isquadop [(V4BF "") (V8BF "q")])
+
 ;; For scalar usage of vector/FP registers
 (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
 		    (HF  "h") (SF "s") (DF "d")
@@ -1308,6 +1314,9 @@
 ;; Register suffix for DOTPROD input types from the return type.
 (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
 
+;; Register suffix for BFDOT input types from the return type.
+(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")])
+
 ;; Sum of lengths of instructions needed to move vector registers of a mode.
 (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
 
@@ -1318,6 +1327,9 @@
 ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub
 (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
 
+;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub
+(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")])
+
 (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])
 
 (define_mode_attr f16quad [(V2SF "") (V4SF "q")])
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**ufoo:
+**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
+**	ret
+*/
+float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq:
+**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
+**	ret
+*/
+float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_f32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+**	bfdot	v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
+**	ret
+*/
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, 0);
+}
+
+/*
+**ufooq_laneq:
+**	bfdot	v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
+**	ret
+*/
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, 2);
+}
+
+/*
+**ufoo_laneq:
+**	bfdot	v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
+**	ret
+*/
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 3);
+}
+
+/*
+**ufooq_lane:
+**	bfdot	v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
+**	ret
+*/
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
+/*
+**ufoo_untied:
+**	mov	v0.8b, v1.8b
+**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
+**	ret
+*/
+float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq_lane_untied:
+**	mov	v0.16b, v1.16b
+**	bfdot	v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
+**	ret
+*/
+float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a4da60a0a721c6ea819e28cb8f178c317eb54de1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O -mbig-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**ufoo:
+**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
+**	ret
+*/
+float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq:
+**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
+**	ret
+*/
+float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_f32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+**	bfdot	v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
+**	ret
+*/
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, 0);
+}
+
+/*
+**ufooq_laneq:
+**	bfdot	v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
+**	ret
+*/
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, 2);
+}
+
+/*
+**ufoo_laneq:
+**	bfdot	v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
+**	ret
+*/
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 3);
+}
+
+/*
+**ufooq_lane:
+**	bfdot	v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
+**	ret
+*/
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
+/*
+**ufoo_untied:
+**	mov	v0.8b, v1.8b
+**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
+**	ret
+*/
+float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq_lane_untied:
+**	mov	v0.16b, v1.16b
+**	bfdot	v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
+**	ret
+*/
+float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
@@ -0,0 +1,28 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "--save-temps" } */
+
+#include <arm_neon.h>
+
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */
+}
+
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */
+}
+
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */
+}
+
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */
+}
+
Richard Sandiford Dec. 30, 2019, 10:29 a.m. UTC | #3
Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7028,3 +7028,36 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +(define_insn "aarch64_bfdot<mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +	(plus:VDQSF
> +	  (unspec:VDQSF
> +	   [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
> +	    (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
> +	    UNSPEC_BFDOT)
> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
> +  "TARGET_BF16_SIMD"
> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
> +  [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +
> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"

Too many blank lines.

> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +	(plus:VDQSF
> +	  (unspec:VDQSF
> +	   [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
> +	    (match_operand:VBF 3 "register_operand" "w")
> +	    (match_operand:SI 4 "const_int_operand" "n")]
> +	    UNSPEC_BFDOT)
> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
> +  "TARGET_BF16_SIMD"
> +{
> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
> +  int lane = INTVAL (operands[4]);
> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
> +}
> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
> +)
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> @@ -0,0 +1,91 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-O -save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */

Same comment as for USDOT/SUDOT regarding the dg- markup.

> +
> +#include <arm_neon.h>
> +
> +/*
> +**ufoo:
> +**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
> +**	ret
> +*/
> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq:
> +**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
> +**	ret
> +*/
> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_f32 (r, x, y);
> +}

The (...|...)s here are correct.

> +
> +/*
> +**ufoo_lane:
> +**	bfdot	v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
> +**	ret
> +*/
> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_lane_f32 (r, x, y, 0);
> +}
> +
> +/*
> +**ufooq_laneq:
> +**	bfdot	v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
> +**	ret
> +*/
> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_laneq_f32 (r, x, y, 2);
> +}
> +
> +/*
> +**ufoo_laneq:
> +**	bfdot	v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
> +**	ret
> +*/
> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
> +{
> +  return vbfdot_laneq_f32 (r, x, y, 3);
> +}
> +
> +/*
> +**ufooq_lane:
> +**	bfdot	v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
> +**	ret
> +*/
> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}

But these aren't, since the operands must be in the order given.

> +
> +/*
> +**ufoo_untied:
> +**	mov	v0.8b, v1.8b
> +**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
> +**	ret
> +*/
> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}

Similarly, OK here.

> +
> +/*
> +**ufooq_lane_untied:
> +**	mov	v0.16b, v1.16b
> +**	bfdot	v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
> +**	ret
> +*/
> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}

...but not here.

Same comments for the big-endian test.

Thanks,
Richard
Stamatis Markianos-Wright Jan. 9, 2020, 2:47 p.m. UTC | #4
On 12/30/19 10:29 AM, Richard Sandiford wrote:
> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -7028,3 +7028,36 @@
>>     "xtn\t%0.<Vntype>, %1.<Vtype>"
>>     [(set_attr "type" "neon_shift_imm_narrow_q")]
>>   )
>> +
>> +(define_insn "aarch64_bfdot<mode>"
>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>> +	(plus:VDQSF
>> +	  (unspec:VDQSF
>> +	   [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
>> +	    (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
>> +	    UNSPEC_BFDOT)
>> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
>> +  "TARGET_BF16_SIMD"
>> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
>> +  [(set_attr "type" "neon_dot<q>")]
>> +)
>> +
>> +
>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> 
> Too many blank lines.

Fixed, sorry I hadn't noticed!

> 
>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>> +	(plus:VDQSF
>> +	  (unspec:VDQSF
>> +	   [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
>> +	    (match_operand:VBF 3 "register_operand" "w")
>> +	    (match_operand:SI 4 "const_int_operand" "n")]
>> +	    UNSPEC_BFDOT)
>> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
>> +  "TARGET_BF16_SIMD"
>> +{
>> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
>> +  int lane = INTVAL (operands[4]);
>> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
>> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
>> +}
>> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
>> +)
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>> @@ -0,0 +1,91 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>> +/* { dg-additional-options "-O -save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> 
> Same comment as for USDOT/SUDOT regarding the dg- markup.

Done!
> 
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**ufoo:
>> +**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
>> +**	ret
>> +*/
>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_f32 (r, x, y);
>> +}
>> +
>> +/*
>> +**ufooq:
>> +**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
>> +**	ret
>> +*/
>> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdotq_f32 (r, x, y);
>> +}
> 
> The (...|...)s here are correct.
Yep.
> 
>> +
>> +/*
>> +**ufoo_lane:
>> +**	bfdot	v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
>> +**	ret
>> +*/
>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_lane_f32 (r, x, y, 0);
>> +}
>> +
>> +/*
>> +**ufooq_laneq:
>> +**	bfdot	v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
>> +**	ret
>> +*/
>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdotq_laneq_f32 (r, x, y, 2);
>> +}
>> +
>> +/*
>> +**ufoo_laneq:
>> +**	bfdot	v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
>> +**	ret
>> +*/
>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdot_laneq_f32 (r, x, y, 3);
>> +}
>> +
>> +/*
>> +**ufooq_lane:
>> +**	bfdot	v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
>> +**	ret
>> +*/
>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>> +}
> 
> But these aren't, since the operands must be in the order given.
Yep.
> 
>> +
>> +/*
>> +**ufoo_untied:
>> +**	mov	v0.8b, v1.8b
>> +**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
>> +**	ret
>> +*/
>> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_f32 (r, x, y);
>> +}
> 
> Similarly, OK here.
Yep.
> 
>> +
>> +/*
>> +**ufooq_lane_untied:
>> +**	mov	v0.16b, v1.16b
>> +**	bfdot	v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
>> +**	ret
>> +*/
>> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>> +}
> 
> ...but not here.
Yep.
> 
> Same comments for the big-endian test.
Done.

Thank you so much for the in depth review comments!

Cheers,
Stam
> 
> Thanks,
> Richard
>
Richard Sandiford Jan. 9, 2020, 3:54 p.m. UTC | #5
Please update the names of the testsuite files to match the ones
in the bfloat16_t patch.  (Same for the usdot/sudot patch -- sorry
for forgetting there.)

OK with that change, thanks.

Richard

Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> On 12/30/19 10:29 AM, Richard Sandiford wrote:
>> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>>> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -7028,3 +7028,36 @@
>>>     "xtn\t%0.<Vntype>, %1.<Vtype>"
>>>     [(set_attr "type" "neon_shift_imm_narrow_q")]
>>>   )
>>> +
>>> +(define_insn "aarch64_bfdot<mode>"
>>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>>> +	(plus:VDQSF
>>> +	  (unspec:VDQSF
>>> +	   [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
>>> +	    (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
>>> +	    UNSPEC_BFDOT)
>>> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
>>> +  "TARGET_BF16_SIMD"
>>> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
>>> +  [(set_attr "type" "neon_dot<q>")]
>>> +)
>>> +
>>> +
>>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
>> 
>> Too many blank lines.
>
> Fixed, sorry I hadn't noticed!
>
>> 
>>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>>> +	(plus:VDQSF
>>> +	  (unspec:VDQSF
>>> +	   [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
>>> +	    (match_operand:VBF 3 "register_operand" "w")
>>> +	    (match_operand:SI 4 "const_int_operand" "n")]
>>> +	    UNSPEC_BFDOT)
>>> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
>>> +  "TARGET_BF16_SIMD"
>>> +{
>>> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
>>> +  int lane = INTVAL (operands[4]);
>>> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
>>> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
>>> +}
>>> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
>>> +)
>>> [...]
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>>> @@ -0,0 +1,91 @@
>>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>>> +/* { dg-additional-options "-O -save-temps" } */
>>> +/* { dg-final { check-function-bodies "**" "" } } */
>>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
>> 
>> Same comment as for USDOT/SUDOT regarding the dg- markup.
>
> Done!
>> 
>>> +
>>> +#include <arm_neon.h>
>>> +
>>> +/*
>>> +**ufoo:
>>> +**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
>>> +**	ret
>>> +*/
>>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdot_f32 (r, x, y);
>>> +}
>>> +
>>> +/*
>>> +**ufooq:
>>> +**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
>>> +**	ret
>>> +*/
>>> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>>> +{
>>> +  return vbfdotq_f32 (r, x, y);
>>> +}
>> 
>> The (...|...)s here are correct.
> Yep.
>> 
>>> +
>>> +/*
>>> +**ufoo_lane:
>>> +**	bfdot	v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
>>> +**	ret
>>> +*/
>>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdot_lane_f32 (r, x, y, 0);
>>> +}
>>> +
>>> +/*
>>> +**ufooq_laneq:
>>> +**	bfdot	v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
>>> +**	ret
>>> +*/
>>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>>> +{
>>> +  return vbfdotq_laneq_f32 (r, x, y, 2);
>>> +}
>>> +
>>> +/*
>>> +**ufoo_laneq:
>>> +**	bfdot	v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
>>> +**	ret
>>> +*/
>>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
>>> +{
>>> +  return vbfdot_laneq_f32 (r, x, y, 3);
>>> +}
>>> +
>>> +/*
>>> +**ufooq_lane:
>>> +**	bfdot	v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
>>> +**	ret
>>> +*/
>>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>>> +}
>> 
>> But these aren't, since the operands must be in the order given.
> Yep.
>> 
>>> +
>>> +/*
>>> +**ufoo_untied:
>>> +**	mov	v0.8b, v1.8b
>>> +**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
>>> +**	ret
>>> +*/
>>> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdot_f32 (r, x, y);
>>> +}
>> 
>> Similarly, OK here.
> Yep.
>> 
>>> +
>>> +/*
>>> +**ufooq_lane_untied:
>>> +**	mov	v0.16b, v1.16b
>>> +**	bfdot	v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
>>> +**	ret
>>> +*/
>>> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>>> +{
>>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>>> +}
>> 
>> ...but not here.
> Yep.
>> 
>> Same comments for the big-endian test.
> Done.
>
> Thank you so much for the in depth review comments!
>
> Cheers,
> Stam
>> 
>> Thanks,
>> Richard
>> 
>
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 57fc5933b43bfc0da132342c681b8a2c14549c9c..41ccda8a5d77b8ec3cfd984f3c5fc02369e7199f 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -682,3 +682,8 @@
>    BUILTIN_VSFDF (UNOP, frint32x, 0)
>    BUILTIN_VSFDF (UNOP, frint64z, 0)
>    BUILTIN_VSFDF (UNOP, frint64x, 0)
> +
> +  /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
> +  VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
> +  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
> +  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index cea9592695ac8bd2f4e625f8b769ddaf716e9091..a95489dc17ac38be8e85457ad1804387f1772dc3 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7025,3 +7025,35 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +(define_insn "aarch64_bfdot<mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +	(plus:VDQSF
> +	  (unspec:VDQSF
> +	   [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
> +	    (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
> +	    UNSPEC_BFDOT)
> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
> +  "TARGET_BF16_SIMD"
> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
> +  [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +	(plus:VDQSF
> +	  (unspec:VDQSF
> +	   [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
> +	    (match_operand:VBF 3 "register_operand" "w")
> +	    (match_operand:SI 4 "const_int_operand" "n")]
> +	    UNSPEC_BFDOT)
> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
> +  "TARGET_BF16_SIMD"
> +{
> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
> +  int lane = INTVAL (operands[4]);
> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
> +}
> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
> +)
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index eaba156e26cf35b07b96972fe2741a9c00d6caa9..1a8b27956d4ca25e0ed6f3c38030b3eba0546c4f 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a)
>  
>  #include "arm_bf16.h"
>  
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
> +{
> +  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
> +{
> +  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
> +		 const int __index)
> +{
> +  return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
> +		  const int __index)
> +{
> +  return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
> +		  const int __index)
> +{
> +  return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline float32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
> +		   const int __index)
> +{
> +  return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
> +}
> +
> +#pragma GCC pop_options
> +
>  #undef __aarch64_vget_lane_any
>  
>  #undef __aarch64_vdup_lane_any
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 2d566ca1a5fad18b701f1954cff967342085874a..091d3a2fb6926f614d354052961d0913d41f71e9 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -122,6 +122,9 @@
>  ;; Quad vector with only 2 element modes.
>  (define_mode_iterator VQ_2E [V2DI V2DF])
>  
> +;; BFmode vector modes.
> +(define_mode_iterator VBF [V4BF V8BF])
> +
>  ;; This mode iterator allows :P to be used for patterns that operate on
>  ;; addresses in different modes.  In LP64, only DI will match, while in
>  ;; ILP32, either can match.
> @@ -671,6 +674,7 @@
>      UNSPEC_UMULHS	; Used in aarch64-sve2.md.
>      UNSPEC_UMULHRS	; Used in aarch64-sve2.md.
>      UNSPEC_ASRD		; Used in aarch64-sve.md.
> +    UNSPEC_BFDOT	; Used in aarch64-simd.md.
>  ])
>  
>  ;; ------------------------------------------------------------------
> @@ -727,6 +731,8 @@
>  
>  (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")])
>  
> +(define_mode_attr isquadop [(V4BF "") (V8BF "q")])
> +
>  ;; For scalar usage of vector/FP registers
>  (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
>  		    (HF  "h") (SF "s") (DF "d")
> @@ -1310,6 +1316,9 @@
>  ;; Register suffix for DOTPROD input types from the return type.
>  (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
>  
> +;; Register suffix for BFDOT input types from the return type.
> +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")])
> +
>  ;; Sum of lengths of instructions needed to move vector registers of a mode.
>  (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
>  
> @@ -1320,6 +1329,9 @@
>  ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub
>  (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
>  
> +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub
> +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")])
> +
>  (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])
>  
>  (define_mode_attr f16quad [(V2SF "") (V4SF "q")])
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ad51507731bbb165de64e583ebfbf8047b4eb781
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> @@ -0,0 +1,91 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**ufoo:
> +**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
> +**	ret
> +*/
> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq:
> +**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
> +**	ret
> +*/
> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_f32 (r, x, y);
> +}
> +
> +/*
> +**ufoo_lane:
> +**	bfdot	v0.2s, v1.4h, v2.2h\[0\]
> +**	ret
> +*/
> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_lane_f32 (r, x, y, 0);
> +}
> +
> +/*
> +**ufooq_laneq:
> +**	bfdot	v0.4s, v1.8h, v2.2h\[2\]
> +**	ret
> +*/
> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_laneq_f32 (r, x, y, 2);
> +}
> +
> +/*
> +**ufoo_laneq:
> +**	bfdot	v0.2s, v1.4h, v2.2h\[3\]
> +**	ret
> +*/
> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
> +{
> +  return vbfdot_laneq_f32 (r, x, y, 3);
> +}
> +
> +/*
> +**ufooq_lane:
> +**	bfdot	v0.4s, v1.8h, v2.2h\[1\]
> +**	ret
> +*/
> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}
> +
> +/*
> +**ufoo_untied:
> +**	mov	v0.8b, v1.8b
> +**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
> +**	ret
> +*/
> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq_lane_untied:
> +**	mov	v0.16b, v1.16b
> +**	bfdot	v0.4s, v2.8h, v3.2h\[1\]
> +**	ret
> +*/
> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..58bdee5ac9df602b7569724200b3c9ab7c72bb28
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
> @@ -0,0 +1,91 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-mbig-endian --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**ufoo:
> +**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
> +**	ret
> +*/
> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq:
> +**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
> +**	ret
> +*/
> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_f32 (r, x, y);
> +}
> +
> +/*
> +**ufoo_lane:
> +**	bfdot	v0.2s, v1.4h, v2.2h\[0\]
> +**	ret
> +*/
> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_lane_f32 (r, x, y, 0);
> +}
> +
> +/*
> +**ufooq_laneq:
> +**	bfdot	v0.4s, v1.8h, v2.2h\[2\]
> +**	ret
> +*/
> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_laneq_f32 (r, x, y, 2);
> +}
> +
> +/*
> +**ufoo_laneq:
> +**	bfdot	v0.2s, v1.4h, v2.2h\[3\]
> +**	ret
> +*/
> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
> +{
> +  return vbfdot_laneq_f32 (r, x, y, 3);
> +}
> +
> +/*
> +**ufooq_lane:
> +**	bfdot	v0.4s, v1.8h, v2.2h\[1\]
> +**	ret
> +*/
> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}
> +
> +/*
> +**ufoo_untied:
> +**	mov	v0.8b, v1.8b
> +**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
> +**	ret
> +*/
> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq_lane_untied:
> +**	mov	v0.16b, v1.16b
> +**	bfdot	v0.4s, v2.8h, v3.2h\[1\]
> +**	ret
> +*/
> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
> @@ -0,0 +1,28 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "--save-temps" } */
> +
> +#include <arm_neon.h>
> +
> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */
> +}
> +
> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */
> +}
> +
> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
> +{
> +  return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */
> +}
> +
> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */
> +}
> +
Stamatis Markianos-Wright Jan. 16, 2020, 4:02 p.m. UTC | #6
On 1/9/20 3:54 PM, Richard Sandiford wrote:
> Please update the names of the testsuite files to match the ones
> in the bfloat16_t patch.  (Same for the usdot/sudot patch -- sorry
> for forgetting there.)
> 
> OK with that change, thanks.
> 

Done and committed as r10-6006-gf275d73a57f1e5a07fbd4978f4b4457a5eaa1e39

Thank you!
Stam

> Richard
> 
> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>> On 12/30/19 10:29 AM, Richard Sandiford wrote:
>>> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>>>> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644
>>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>>> @@ -7028,3 +7028,36 @@
>>>>      "xtn\t%0.<Vntype>, %1.<Vtype>"
>>>>      [(set_attr "type" "neon_shift_imm_narrow_q")]
>>>>    )
>>>> +
>>>> +(define_insn "aarch64_bfdot<mode>"
>>>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>>>> +	(plus:VDQSF
>>>> +	  (unspec:VDQSF
>>>> +	   [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
>>>> +	    (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
>>>> +	    UNSPEC_BFDOT)
>>>> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
>>>> +  "TARGET_BF16_SIMD"
>>>> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
>>>> +  [(set_attr "type" "neon_dot<q>")]
>>>> +)
>>>> +
>>>> +
>>>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
>>>
>>> Too many blank lines.
>>
>> Fixed, sorry I hadn't noticed!
>>
>>>
>>>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>>>> +	(plus:VDQSF
>>>> +	  (unspec:VDQSF
>>>> +	   [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
>>>> +	    (match_operand:VBF 3 "register_operand" "w")
>>>> +	    (match_operand:SI 4 "const_int_operand" "n")]
>>>> +	    UNSPEC_BFDOT)
>>>> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
>>>> +  "TARGET_BF16_SIMD"
>>>> +{
>>>> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
>>>> +  int lane = INTVAL (operands[4]);
>>>> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
>>>> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
>>>> +}
>>>> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
>>>> +)
>>>> [...]
>>>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>>>> new file mode 100644
>>>> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>>>> @@ -0,0 +1,91 @@
>>>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>>>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>>>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>>>> +/* { dg-additional-options "-O -save-temps" } */
>>>> +/* { dg-final { check-function-bodies "**" "" } } */
>>>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
>>>
>>> Same comment as for USDOT/SUDOT regarding the dg- markup.
>>
>> Done!
>>>
>>>> +
>>>> +#include <arm_neon.h>
>>>> +
>>>> +/*
>>>> +**ufoo:
>>>> +**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
>>>> +**	ret
>>>> +*/
>>>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>>> +{
>>>> +  return vbfdot_f32 (r, x, y);
>>>> +}
>>>> +
>>>> +/*
>>>> +**ufooq:
>>>> +**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
>>>> +**	ret
>>>> +*/
>>>> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>>>> +{
>>>> +  return vbfdotq_f32 (r, x, y);
>>>> +}
>>>
>>> The (...|...)s here are correct.
>> Yep.
>>>
>>>> +
>>>> +/*
>>>> +**ufoo_lane:
>>>> +**	bfdot	v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
>>>> +**	ret
>>>> +*/
>>>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>>> +{
>>>> +  return vbfdot_lane_f32 (r, x, y, 0);
>>>> +}
>>>> +
>>>> +/*
>>>> +**ufooq_laneq:
>>>> +**	bfdot	v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
>>>> +**	ret
>>>> +*/
>>>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>>>> +{
>>>> +  return vbfdotq_laneq_f32 (r, x, y, 2);
>>>> +}
>>>> +
>>>> +/*
>>>> +**ufoo_laneq:
>>>> +**	bfdot	v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
>>>> +**	ret
>>>> +*/
>>>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
>>>> +{
>>>> +  return vbfdot_laneq_f32 (r, x, y, 3);
>>>> +}
>>>> +
>>>> +/*
>>>> +**ufooq_lane:
>>>> +**	bfdot	v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
>>>> +**	ret
>>>> +*/
>>>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>>>> +{
>>>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>>>> +}
>>>
>>> But these aren't, since the operands must be in the order given.
>> Yep.
>>>
>>>> +
>>>> +/*
>>>> +**ufoo_untied:
>>>> +**	mov	v0.8b, v1.8b
>>>> +**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
>>>> +**	ret
>>>> +*/
>>>> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>>>> +{
>>>> +  return vbfdot_f32 (r, x, y);
>>>> +}
>>>
>>> Similarly, OK here.
>> Yep.
>>>
>>>> +
>>>> +/*
>>>> +**ufooq_lane_untied:
>>>> +**	mov	v0.16b, v1.16b
>>>> +**	bfdot	v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
>>>> +**	ret
>>>> +*/
>>>> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>>>> +{
>>>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>>>> +}
>>>
>>> ...but not here.
>> Yep.
>>>
>>> Same comments for the big-endian test.
>> Done.
>>
>> Thank you so much for the in depth review comments!
>>
>> Cheers,
>> Stam
>>>
>>> Thanks,
>>> Richard
>>>
>>
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index 57fc5933b43bfc0da132342c681b8a2c14549c9c..41ccda8a5d77b8ec3cfd984f3c5fc02369e7199f 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -682,3 +682,8 @@
>>     BUILTIN_VSFDF (UNOP, frint32x, 0)
>>     BUILTIN_VSFDF (UNOP, frint64z, 0)
>>     BUILTIN_VSFDF (UNOP, frint64x, 0)
>> +
>> +  /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
>> +  VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
>> +  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
>> +  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index cea9592695ac8bd2f4e625f8b769ddaf716e9091..a95489dc17ac38be8e85457ad1804387f1772dc3 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -7025,3 +7025,35 @@
>>     "xtn\t%0.<Vntype>, %1.<Vtype>"
>>     [(set_attr "type" "neon_shift_imm_narrow_q")]
>>   )
>> +
>> +(define_insn "aarch64_bfdot<mode>"
>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>> +	(plus:VDQSF
>> +	  (unspec:VDQSF
>> +	   [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
>> +	    (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
>> +	    UNSPEC_BFDOT)
>> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
>> +  "TARGET_BF16_SIMD"
>> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
>> +  [(set_attr "type" "neon_dot<q>")]
>> +)
>> +
>> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
>> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
>> +	(plus:VDQSF
>> +	  (unspec:VDQSF
>> +	   [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
>> +	    (match_operand:VBF 3 "register_operand" "w")
>> +	    (match_operand:SI 4 "const_int_operand" "n")]
>> +	    UNSPEC_BFDOT)
>> +	  (match_operand:VDQSF 1 "register_operand" "0")))]
>> +  "TARGET_BF16_SIMD"
>> +{
>> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
>> +  int lane = INTVAL (operands[4]);
>> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
>> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
>> +}
>> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
>> +)
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index eaba156e26cf35b07b96972fe2741a9c00d6caa9..1a8b27956d4ca25e0ed6f3c38030b3eba0546c4f 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>> @@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a)
>>   
>>   #include "arm_bf16.h"
>>   
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>> +
>> +__extension__ extern __inline float32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
>> +{
>> +  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
>> +{
>> +  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline float32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
>> +		 const int __index)
>> +{
>> +  return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
>> +		  const int __index)
>> +{
>> +  return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
>> +}
>> +
>> +__extension__ extern __inline float32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
>> +		  const int __index)
>> +{
>> +  return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
>> +}
>> +
>> +__extension__ extern __inline float32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
>> +		   const int __index)
>> +{
>> +  return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
>> +}
>> +
>> +#pragma GCC pop_options
>> +
>>   #undef __aarch64_vget_lane_any
>>   
>>   #undef __aarch64_vdup_lane_any
>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> index 2d566ca1a5fad18b701f1954cff967342085874a..091d3a2fb6926f614d354052961d0913d41f71e9 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -122,6 +122,9 @@
>>   ;; Quad vector with only 2 element modes.
>>   (define_mode_iterator VQ_2E [V2DI V2DF])
>>   
>> +;; BFmode vector modes.
>> +(define_mode_iterator VBF [V4BF V8BF])
>> +
>>   ;; This mode iterator allows :P to be used for patterns that operate on
>>   ;; addresses in different modes.  In LP64, only DI will match, while in
>>   ;; ILP32, either can match.
>> @@ -671,6 +674,7 @@
>>       UNSPEC_UMULHS	; Used in aarch64-sve2.md.
>>       UNSPEC_UMULHRS	; Used in aarch64-sve2.md.
>>       UNSPEC_ASRD		; Used in aarch64-sve.md.
>> +    UNSPEC_BFDOT	; Used in aarch64-simd.md.
>>   ])
>>   
>>   ;; ------------------------------------------------------------------
>> @@ -727,6 +731,8 @@
>>   
>>   (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")])
>>   
>> +(define_mode_attr isquadop [(V4BF "") (V8BF "q")])
>> +
>>   ;; For scalar usage of vector/FP registers
>>   (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
>>   		    (HF  "h") (SF "s") (DF "d")
>> @@ -1310,6 +1316,9 @@
>>   ;; Register suffix for DOTPROD input types from the return type.
>>   (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
>>   
>> +;; Register suffix for BFDOT input types from the return type.
>> +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")])
>> +
>>   ;; Sum of lengths of instructions needed to move vector registers of a mode.
>>   (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
>>   
>> @@ -1320,6 +1329,9 @@
>>   ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub
>>   (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
>>   
>> +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub
>> +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")])
>> +
>>   (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])
>>   
>>   (define_mode_attr f16quad [(V2SF "") (V4SF "q")])
>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..ad51507731bbb165de64e583ebfbf8047b4eb781
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
>> @@ -0,0 +1,91 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>> +/* { dg-additional-options "-save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**ufoo:
>> +**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
>> +**	ret
>> +*/
>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_f32 (r, x, y);
>> +}
>> +
>> +/*
>> +**ufooq:
>> +**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
>> +**	ret
>> +*/
>> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdotq_f32 (r, x, y);
>> +}
>> +
>> +/*
>> +**ufoo_lane:
>> +**	bfdot	v0.2s, v1.4h, v2.2h\[0\]
>> +**	ret
>> +*/
>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_lane_f32 (r, x, y, 0);
>> +}
>> +
>> +/*
>> +**ufooq_laneq:
>> +**	bfdot	v0.4s, v1.8h, v2.2h\[2\]
>> +**	ret
>> +*/
>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdotq_laneq_f32 (r, x, y, 2);
>> +}
>> +
>> +/*
>> +**ufoo_laneq:
>> +**	bfdot	v0.2s, v1.4h, v2.2h\[3\]
>> +**	ret
>> +*/
>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdot_laneq_f32 (r, x, y, 3);
>> +}
>> +
>> +/*
>> +**ufooq_lane:
>> +**	bfdot	v0.4s, v1.8h, v2.2h\[1\]
>> +**	ret
>> +*/
>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>> +}
>> +
>> +/*
>> +**ufoo_untied:
>> +**	mov	v0.8b, v1.8b
>> +**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
>> +**	ret
>> +*/
>> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_f32 (r, x, y);
>> +}
>> +
>> +/*
>> +**ufooq_lane_untied:
>> +**	mov	v0.16b, v1.16b
>> +**	bfdot	v0.4s, v2.8h, v3.2h\[1\]
>> +**	ret
>> +*/
>> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..58bdee5ac9df602b7569724200b3c9ab7c72bb28
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
>> @@ -0,0 +1,91 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>> +/* { dg-additional-options "-mbig-endian --save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**ufoo:
>> +**	bfdot	v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
>> +**	ret
>> +*/
>> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_f32 (r, x, y);
>> +}
>> +
>> +/*
>> +**ufooq:
>> +**	bfdot	v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
>> +**	ret
>> +*/
>> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdotq_f32 (r, x, y);
>> +}
>> +
>> +/*
>> +**ufoo_lane:
>> +**	bfdot	v0.2s, v1.4h, v2.2h\[0\]
>> +**	ret
>> +*/
>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_lane_f32 (r, x, y, 0);
>> +}
>> +
>> +/*
>> +**ufooq_laneq:
>> +**	bfdot	v0.4s, v1.8h, v2.2h\[2\]
>> +**	ret
>> +*/
>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdotq_laneq_f32 (r, x, y, 2);
>> +}
>> +
>> +/*
>> +**ufoo_laneq:
>> +**	bfdot	v0.2s, v1.4h, v2.2h\[3\]
>> +**	ret
>> +*/
>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdot_laneq_f32 (r, x, y, 3);
>> +}
>> +
>> +/*
>> +**ufooq_lane:
>> +**	bfdot	v0.4s, v1.8h, v2.2h\[1\]
>> +**	ret
>> +*/
>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>> +}
>> +
>> +/*
>> +**ufoo_untied:
>> +**	mov	v0.8b, v1.8b
>> +**	bfdot	v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
>> +**	ret
>> +*/
>> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_f32 (r, x, y);
>> +}
>> +
>> +/*
>> +**ufooq_lane_untied:
>> +**	mov	v0.16b, v1.16b
>> +**	bfdot	v0.4s, v2.8h, v3.2h\[1\]
>> +**	ret
>> +*/
>> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdotq_lane_f32 (r, x, y, 1);
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
>> @@ -0,0 +1,28 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>> +/* { dg-additional-options "--save-temps" } */
>> +
>> +#include <arm_neon.h>
>> +
>> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */
>> +}
>> +
>> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */
>> +}
>> +
>> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
>> +{
>> +  return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */
>> +}
>> +
>> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
>> +{
>> +  return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */
>> +}
>> +
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..6c5b61c37bcb340f963861723c6e365e32f6ca95 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,8 @@ 
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
+  VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c4858ab7cffd786066646a5cd95a168311990b76..bdc26c190610580e57e9749804b7729ee4e34793 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7027,3 +7027,37 @@ 
   "xtn\t%0.<Vntype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+(define_insn "aarch64_bfdot<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
+		    (unspec:VDQSF [(match_operand:<VBFMLA_W> 2
+						"register_operand" "w")
+				   (match_operand:<VBFMLA_W> 3
+						"register_operand" "w")]
+				   UNSPEC_BFDOT)))]
+  "TARGET_BF16_SIMD"
+  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+
+(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(plus:VDQSF (match_operand:VDQSF 1 "register_operand" "0")
+		    (unspec:VDQSF [(match_operand:<VDQSF:VBFMLA_W> 2
+						"register_operand" "w")
+				   (match_operand: VBF 3
+						"register_operand" "w")
+				   (match_operand:SI 4
+						"const_int_operand" "n")]
+				   UNSPEC_BFDOT)))]
+  "TARGET_BF16_SIMD"
+{
+  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
+  int lane = INTVAL (operands[4]);
+  operands[4] =  gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
+  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
+}
+  [(set_attr "type" "neon_dot<VDQSF:q>")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 5996df0a612caff3c881fc15b0aa12b8f91a193b..0357d97cc4143c3a9c56260d9a9cc24138afc049 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34612,6 +34612,57 @@  vrnd64xq_f64 (float64x2_t __a)
 
 #include "arm_bf16.h"
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_lane_f32 \
+      (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, const int __index)
+{
+  return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_lane_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, const int __index)
+{
+  return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_laneq_f32 \
+     (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, const int __index)
+{
+  return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_laneq_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, const int __index)
+{
+  return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 9480efef47c1865867148bd43dbf10faf227f5d0..4645e481b864d505c0d2de2d0bae776982f2c823 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -120,6 +120,9 @@ 
 ;; Quad vector with only 2 element modes.
 (define_mode_iterator VQ_2E [V2DI V2DF])
 
+;; BFmode vector modes.
+(define_mode_iterator VBF [V4BF V8BF])
+
 ;; This mode iterator allows :P to be used for patterns that operate on
 ;; addresses in different modes.  In LP64, only DI will match, while in
 ;; ILP32, either can match.
@@ -673,6 +676,7 @@ 
     UNSPEC_UMULHS	; Used in aarch64-sve2.md.
     UNSPEC_UMULHRS	; Used in aarch64-sve2.md.
     UNSPEC_ASRD		; Used in aarch64-sve.md.
+    UNSPEC_BFDOT	; Used in aarch64-simd.md.
 ])
 
 ;; ------------------------------------------------------------------
@@ -729,6 +733,8 @@ 
 
 (define_mode_attr FCVT_CHANGE_MODE [(SI "DF") (DI "SF")])
 
+(define_mode_attr isquadop [(V4BF "") (V8BF "q")])
+
 ;; For scalar usage of vector/FP registers
 (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
 		    (HF  "h") (SF "s") (DF "d")
@@ -1310,6 +1316,9 @@ 
 ;; Register suffix for DOTPROD input types from the return type.
 (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
 
+;; Register suffix for BFDOT input types from the return type.
+(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")])
+
 ;; Sum of lengths of instructions needed to move vector registers of a mode.
 (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
 
@@ -1320,6 +1329,9 @@ 
 ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub
 (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
 
+;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub
+(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")])
+
 (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])
 
 (define_mode_attr f16quad [(V2SF "") (V4SF "q")])
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..62ac715c2a9c4468eb7c143464390dbf1144d6d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
@@ -0,0 +1,80 @@ 
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "--save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include <arm_neon.h>
+
+/*
+**ufoo:
+**	...
+**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h
+**	...
+**	ret
+*/
+float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq:
+**	...
+**	bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+**	...
+**	ret
+*/
+float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_f32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+**	...
+**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.2h\[0\]
+**	...
+**	ret
+*/
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, 0);
+}
+
+/*
+**ufooq_laneq:
+**	...
+**	bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.2h\[2\]
+**	...
+**	ret
+*/
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, 2);
+}
+
+/*
+**ufoo_laneq:
+**	...
+**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.2h\[3\]
+**	...
+**	ret
+*/
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 3);
+}
+
+/*
+**ufooq_lane:
+**	...
+**	bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.2h\[1\]
+**	...
+**	ret
+*/
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..ae910bbdc0759e7bdd40566ef211f4f83b411792
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-2.c
@@ -0,0 +1,80 @@ 
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-mbig-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include <arm_neon.h>
+
+/*
+**ufoo:
+**	...
+**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h
+**	...
+**	ret
+*/
+float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq:
+**	...
+**	bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+**	...
+**	ret
+*/
+float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_f32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+**	...
+**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.2h\[0\]
+**	...
+**	ret
+*/
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, 0);
+}
+
+/*
+**ufooq_laneq:
+**	...
+**	bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.2h\[2\]
+**	...
+**	ret
+*/
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, 2);
+}
+
+/*
+**ufoo_laneq:
+**	...
+**	bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.2h\[3\]
+**	...
+**	ret
+*/
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 3);
+}
+
+/*
+**ufooq_lane:
+**	...
+**	bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.2h\[1\]
+**	...
+**	ret
+*/
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..607126203b00213d94471a1adefe16f265104af8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-3.c
@@ -0,0 +1,28 @@ 
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "--save-temps" } */
+
+#include <arm_neon.h>
+
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */
+}
+
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */
+}
+
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */
+}
+
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */
+}
+