@@ -682,3 +682,14 @@
BUILTIN_VSFDF (UNOP, frint32x, 0)
BUILTIN_VSFDF (UNOP, frint64z, 0)
BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+ /* Implemented by aarch64_bfmmlaqv4sf.  */
+ VAR1 (TERNOP, bfmmlaq, 0, v4sf)
+
+ /* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf.  */
+ VAR1 (TERNOP, bfmlalb, 0, v4sf)
+ VAR1 (TERNOP, bfmlalt, 0, v4sf)
+ VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf)
+ VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
+ VAR1 (QUADOP_LANE, bfmlalb_laneq, 0, v4sf)
+ VAR1 (QUADOP_LANE, bfmlalt_laneq, 0, v4sf)
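
(For context: in the aarch64 builtin tables, a TERNOP entry expands to a three-operand builtin, here the float32x4 accumulator plus two bfloat16 vector sources, while a QUADOP_LANE entry adds a fourth constant lane index that the expander range-checks against the lane count of the indexed operand.)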
@@ -7027,3 +7027,57 @@
"xtn\t%0.<Vntype>, %1.<Vtype>"
[(set_attr "type" "neon_shift_imm_narrow_q")]
)
+
+;; bfmmla: BFloat16 matrix multiply-accumulate into a 2x2 float32 matrix.
+(define_insn "aarch64_bfmmlaqv4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=w")
+ (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:V8BF 3 "register_operand" "w")]
+ UNSPEC_BFMMLA)))]
+ "TARGET_BF16_SIMD"
+ "bfmmla\\t%0.4s, %2.8h, %3.8h"
+ [(set_attr "type" "neon_mla_s_q")]
+)
+
+;; bfmlal<bt>: BFloat16 widening multiply-add long (bottom/top).
+(define_insn "aarch64_bfmlal<bt>v4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=w")
+ (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:V8BF 3 "register_operand" "w")]
+ BF_MLA)))]
+ "TARGET_BF16_SIMD"
+ "bfmlal<bt>\\t%0.4s, %2.8h, %3.8h"
+ [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_insn "aarch64_bfmlal<bt>_lanev4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=w")
+ (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:V4BF 3 "register_operand" "w")
+ (match_operand:SI 4 "const_int_operand" "n")]
+ BF_MLA)))]
+ "TARGET_BF16_SIMD"
+{
+ operands[4] = aarch64_endian_lane_rtx (V4BFmode, INTVAL (operands[4]));
+ return "bfmlal<bt>\\t%0.4s, %2.8h, %3.h[%4]";
+}
+ [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_insn "aarch64_bfmlal<bt>_laneqv4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=w")
+ (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:V8BF 3 "register_operand" "w")
+ (match_operand:SI 4 "const_int_operand" "n")]
+ BF_MLA)))]
+ "TARGET_BF16_SIMD"
+{
+ operands[4] = aarch64_endian_lane_rtx (V8BFmode, INTVAL (operands[4]));
+ return "bfmlal<bt>\\t%0.4s, %2.8h, %3.h[%4]";
+}
+ [(set_attr "type" "neon_fp_mla_s")]
+)
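
As a semantic reference for the patterns above, here is a minimal scalar model in C, assuming the behavior documented in the Arm architecture manual and simplifying the instructions' exact rounding and denormal handling; the helper names are hypothetical. BFMLALB and BFMLALT widen the even-numbered and odd-numbered bfloat16 elements respectively to single precision, multiply them pairwise, and accumulate into the float32 vector. BFMMLA can likewise be thought of as accumulating the product of a 2x4 bfloat16 matrix with the transpose of another 2x4 bfloat16 matrix into a 2x2 float32 matrix.

#include <stdint.h>

/* Widen a bfloat16 bit pattern to float: bfloat16 is the upper half of
   an IEEE binary32, so shifting it into the high 16 bits suffices.  */
static inline float
bf16_to_f32 (uint16_t bits)
{
  union { uint32_t u; float f; } v = { .u = (uint32_t) bits << 16 };
  return v.f;
}

/* Hypothetical scalar model of BFMLALB (odd = 0) and BFMLALT (odd = 1):
   r[i] += widen (a[2*i + odd]) * widen (b[2*i + odd]), for i = 0..3.  */
static void
bfmlal_model (float r[4], const uint16_t a[8], const uint16_t b[8], int odd)
{
  for (int i = 0; i < 4; i++)
    r[i] += bf16_to_f32 (a[2 * i + odd]) * bf16_to_f32 (b[2 * i + odd]);
}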
@@ -34610,6 +34610,66 @@ vrnd64xq_f64 (float64x2_t __a)
#include "arm_bf16.h"
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ return __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ return __builtin_aarch64_bfmlalbv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ return __builtin_aarch64_bfmlaltv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+                    const int __index)
+{
+ return __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+                    const int __index)
+{
+ return __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+                     const int __index)
+{
+ return __builtin_aarch64_bfmlalb_laneqv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+                     const int __index)
+{
+ return __builtin_aarch64_bfmlalt_laneqv4sf (__r, __a, __b, __index);
+}
+
+#endif
+#pragma GCC pop_options
+
#pragma GCC pop_options
#undef __aarch64_vget_lane_any
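
As a usage sketch (a hypothetical caller, not part of the patch; compile with, e.g., -march=armv8.2-a+bf16):

#include <arm_neon.h>

/* Accumulate the elementwise products of two bfloat16 vectors into a
   float32x4 accumulator: the "b" intrinsic covers the even-numbered
   lanes and the "t" intrinsic the odd-numbered ones.  */
float32x4_t
bf16_mla (float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b)
{
  acc = vbfmlalbq_f32 (acc, a, b);
  acc = vbfmlaltq_f32 (acc, a, b);
  return acc;
}

The _lane and _laneq variants instead broadcast a single element of the second source, with the constant index checked at expansion time against 0..3 (64-bit source) or 0..7 (128-bit source), as the diagnostic test below exercises.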
@@ -673,6 +673,9 @@
UNSPEC_UMULHS ; Used in aarch64-sve2.md.
UNSPEC_UMULHRS ; Used in aarch64-sve2.md.
UNSPEC_ASRD ; Used in aarch64-sve.md.
+ UNSPEC_BFMMLA ; Used in aarch64-simd.md.
+ UNSPEC_BFMLALB ; Used in aarch64-simd.md.
+ UNSPEC_BFMLALT ; Used in aarch64-simd.md.
])
;; ------------------------------------------------------------------
@@ -2127,6 +2130,9 @@
(define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
+(define_int_iterator BF_MLA [UNSPEC_BFMLALB
+ UNSPEC_BFMLALT])
+
;; Iterators for atomic operations.
(define_int_iterator ATOMIC_LDOP
@@ -2342,7 +2348,8 @@
(UNSPEC_SRHADD "") (UNSPEC_URHADD "u")])
(define_int_attr bt [(UNSPEC_SMULLB "b") (UNSPEC_UMULLB "b")
- (UNSPEC_SMULLT "t") (UNSPEC_UMULLT "t")])
+ (UNSPEC_SMULLT "t") (UNSPEC_UMULLT "t")
+ (UNSPEC_BFMLALB "b") (UNSPEC_BFMLALT "t")])
(define_int_attr fn [(UNSPEC_LDFF1 "f") (UNSPEC_LDNF1 "n")])
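
With these iterator definitions, the BF_MLA int iterator together with the extended <bt> attribute expands the single aarch64_bfmlal<bt>v4sf template into the aarch64_bfmlalbv4sf and aarch64_bfmlaltv4sf patterns that the builtin table entries above reference.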
new file mode 100644
@@ -0,0 +1,73 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**test_bfmlalb:
+** ...
+** bfmlalb v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+float32x4_t test_bfmlalb (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_bfmlalt:
+** ...
+** bfmlalt v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+float32x4_t test_bfmlalt (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_bfmlalb_lane:
+** ...
+** bfmlalb v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.h\[0\]
+** ...
+*/
+float32x4_t test_bfmlalb_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_bfmlalt_lane:
+** ...
+** bfmlalt v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.h\[2\]
+** ...
+*/
+float32x4_t test_bfmlalt_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_bfmlalb_laneq:
+** ...
+** bfmlalb v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.h\[4\]
+** ...
+*/
+float32x4_t test_bfmlalb_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ return vbfmlalbq_laneq_f32 (r, a, b, 4);
+}
+
+/*
+**test_bfmlalt_laneq:
+** ...
+** bfmlalt v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.h\[7\]
+** ...
+*/
+float32x4_t test_bfmlalt_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
new file mode 100644
@@ -0,0 +1,18 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**test_bfmmla:
+** ...
+** bfmmla v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+float32x4_t test_bfmmla (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+ return vbfmmlaq_f32 (r, x, y);
+}
new file mode 100644
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
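+/* The number at the end of each dg-error directive is the arm_neon.h
+   line at which the out-of-range diagnostic is expected to be reported.  */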
+
+void
+f_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 34655 } */
+ vbfmlaltq_lane_f32 (r, a, b, -1);
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 34655 } */
+ vbfmlaltq_lane_f32 (r, a, b, 4);
+ return;
+}
+
+void
+f_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 34671 } */
+ vbfmlaltq_laneq_f32 (r, a, b, -1);
+ /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 34671 } */
+ vbfmlaltq_laneq_f32 (r, a, b, 8);
+ return;
+}
+
+void
+f_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 34647 } */
+ vbfmlalbq_lane_f32 (r, a, b, -1);
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 34647 } */
+ vbfmlalbq_lane_f32 (r, a, b, 4);
+ return;
+}
+
+void
+f_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 34663 } */
+ vbfmlalbq_laneq_f32 (r, a, b, -1);
+ /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 34663 } */
+ vbfmlalbq_laneq_f32 (r, a, b, 8);
+ return;
+}