diff mbox series

[AArch64] ACLE intrinsics bfmmla and bfmlal<b/t> for AArch64 AdvSIMD

Message ID baf48c6b-8c79-cd5f-1eff-95346c4e1392@arm.com
State New
Headers show
Series [AArch64] ACLE intrinsics bfmmla and bfmlal<b/t> for AArch64 AdvSIMD | expand

Commit Message

Delia Burduv Dec. 20, 2019, 6:39 p.m. UTC
This patch adds the ARMv8.6 ACLE intrinsics for bfmmla, bfmlalb and 
bfmlalt as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
The intrinsics are declared in arm_neon.h and the RTL patterns are 
defined in aarch64-simd.md.
Two new tests are added to check assembler output.

This patch depends on the two AArch64 back-end patches. 
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html and 
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html)

Tested for regression on aarch64-none-elf and aarch64_be-none-elf. I 
don't have commit rights, so if this is ok can someone please commit it 
for me?

gcc/ChangeLog:

2019-10-29  Delia Burduv  <delia.burduv@arm.com>

         * config/aarch64/aarch64-simd-builtins.def
           (bfmmlaq): New built-in function.
           (bfmlalb): New built-in function.
           (bfmlalt): New built-in function.
           (bfmlalb_lane): New built-in function.
           (bfmlalt_lane): New built-in function.
           (bfmlalb_laneq): New built-in function.
           (bfmlalt_laneq): New built-in function.
         * config/aarch64/aarch64-simd.md (aarch64_bfmmlaqv4sf): New pattern.
           (aarch64_bfmlal<bt>v4sf): New pattern.
           (aarch64_bfmlal<bt>_lanev4sf): New pattern.
           (aarch64_bfmlal<bt>_laneqv4sf): New pattern.
         * config/aarch64/arm_neon.h (vbfmmlaq_f32): New intrinsic.
           (vbfmlalbq_f32): New intrinsic.
           (vbfmlaltq_f32): New intrinsic.
           (vbfmlalbq_lane_f32): New intrinsic.
           (vbfmlaltq_lane_f32): New intrinsic.
           (vbfmlalbq_laneq_f32): New intrinsic.
           (vbfmlaltq_laneq_f32): New intrinsic.
         * config/aarch64/iterators.md (UNSPEC_BFMMLA): New UNSPEC.
           (UNSPEC_BFMLALB): New UNSPEC.
           (UNSPEC_BFMLALT): New UNSPEC.
           (BF_MLA): New int iterator.
           (bt): Added UNSPEC_BFMLALB, UNSPEC_BFMLALT.
         * config/arm/types.md (bf_mmla): New type.
           (bf_mla): New type.

gcc/testsuite/ChangeLog:

2019-10-29  Delia Burduv  <delia.burduv@arm.com>

         * gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c: New test.
         * gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c: New test.
         * gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c:
           New test.
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..5e9f50f090870d0c63916540a48f5ac132d2630d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,14 @@ 
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_bfmmlaqv4sf  */
+  VAR1 (TERNOP, bfmmlaq, 0, v4sf)
+
+  /* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf  */
+  VAR1 (TERNOP, bfmlalb, 0, v4sf)
+  VAR1 (TERNOP, bfmlalt, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_laneq, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_laneq, 0, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 55660ae248f4fa75d35ba2949cd4b9d5139ff5f5..66a6c4116a1fdd26dd4eec8b0609e28eb2c38fa1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7027,3 +7027,57 @@ 
   "xtn\t%0.<Vntype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+;; bfmmla
+(define_insn "aarch64_bfmmlaqv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+                   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                 (match_operand:V8BF 3 "register_operand" "w")]
+                    UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "bfmmla\\t%0.4s, %2.8h, %3.8h"
+  [(set_attr "type" "neon_mla_s_q")]
+)
+
+;; bfmlal<bt>
+(define_insn "aarch64_bfmlal<bt>v4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "w")]
+                     BF_MLA)))]
+  "TARGET_BF16_SIMD"
+  "bfmlal<bt>\\t%0.4s, %2.8h, %3.8h"
+  [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_insn "aarch64_bfmlal<bt>_lanev4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V4BF 3 "register_operand" "w")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MLA)))]
+  "TARGET_BF16_SIMD"
+{
+  operands[4] = aarch64_endian_lane_rtx (V4BFmode, INTVAL (operands[4]));
+  return "bfmlal<bt>\\t%0.4s, %2.8h, %3.h[%4]";
+}
+  [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_insn "aarch64_bfmlal<bt>_laneqv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+        (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+                    (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+                                  (match_operand:V8BF 3 "register_operand" "w")
+                                  (match_operand:SI 4 "const_int_operand" "n")]
+                     BF_MLA)))]
+  "TARGET_BF16_SIMD"
+{
+  operands[4] = aarch64_endian_lane_rtx (V8BFmode, INTVAL (operands[4]));
+  return "bfmlal<bt>\\t%0.4s, %2.8h, %3.h[%4]";
+}
+  [(set_attr "type" "neon_fp_mla_s")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6cdbf381f0156ed993f03b847228b36ebbdd14f8..9001c63b0d44e7ad699ace097b9259681b691033 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34610,6 +34610,70 @@  vrnd64xq_f64 (float64x2_t __a)
 
 #include "arm_bf16.h"
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+
+{
+  return __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_aarch64_bfmlalbv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_aarch64_bfmlaltv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, const int __index)
+{
+  return __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, const int __index)
+{
+  return __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, const int __index)
+{
+  return __builtin_aarch64_bfmlalb_laneqv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 \
+      (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, const int __index)
+{
+  return __builtin_aarch64_bfmlalt_laneqv4sf (__r, __a, __b, __index);
+}
+
+#endif
+#pragma GCC pop_options
+
 #pragma GCC pop_options
 
 #undef __aarch64_vget_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 931166da5e47302afe810498eea9c8c2ab89b9de..e7ca2faba0e1ef5d59ac658eaa4d788b00b8b587 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -673,6 +673,9 @@ 
     UNSPEC_UMULHS	; Used in aarch64-sve2.md.
     UNSPEC_UMULHRS	; Used in aarch64-sve2.md.
     UNSPEC_ASRD		; Used in aarch64-sve.md.
+    UNSPEC_BFMMLA	; Used in aarch64-simd.md.
+    UNSPEC_BFMLALB	; Used in aarch64-simd.md.
+    UNSPEC_BFMLALT	; Used in aarch64-simd.md.
 ])
 
 ;; ------------------------------------------------------------------
@@ -2127,6 +2130,9 @@ 
 
 (define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
 
+(define_int_iterator BF_MLA [UNSPEC_BFMLALB
+			     UNSPEC_BFMLALT])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -2342,7 +2348,8 @@ 
 		    (UNSPEC_SRHADD "") (UNSPEC_URHADD "u")])
 
 (define_int_attr bt [(UNSPEC_SMULLB "b") (UNSPEC_UMULLB "b")
-		     (UNSPEC_SMULLT "t") (UNSPEC_UMULLT "t")])
+		     (UNSPEC_SMULLT "t") (UNSPEC_UMULLT "t")
+		     (UNSPEC_BFMLALB "b") (UNSPEC_BFMLALT "t")])
 
 (define_int_attr fn [(UNSPEC_LDFF1 "f") (UNSPEC_LDNF1 "n")])
 
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index df39522f2ad63a52c910b1a6bcc7aa13aaf5d021..2f5ada97991abc88cc74f4768eb395b2b757ee26 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -550,6 +550,10 @@ 
 ; The classification below is for TME instructions
 ;
 ; tme
+;
+; The classification below is for BFloat16 widening multiply-add
+;
+; bf_mla
 
 (define_attr "type"
  "adc_imm,\
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
new file mode 100644
index 0000000000000000000000000000000000000000..11558be667c65228529ead90628604cba0bbd044
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
@@ -0,0 +1,73 @@ 
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include <arm_neon.h>
+
+/*
+**test_bfmlalb:
+**      ...
+**      bfmlalb	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+**      ...
+*/
+float32x4_t test_bfmlalb (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_bfmlalt:
+**      ...
+**      bfmlalt	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+**      ...
+*/
+float32x4_t test_bfmlalt (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_bfmlalb_lane:
+**      ...
+**      bfmlalb	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.h\[0\]
+**      ...
+*/
+float32x4_t test_bfmlalb_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_bfmlalt_lane:
+**      ...
+**      bfmlalt	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.h\[2\]
+**      ...
+*/
+float32x4_t test_bfmlalt_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_bfmlalb_laneq:
+**      ...
+**      bfmlalb	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.h\[4\]
+**      ...
+*/
+float32x4_t test_bfmlalb_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlalbq_laneq_f32 (r, a, b, 4);
+}
+
+/*
+**test_bfmlalt_laneq:
+**      ...
+**      bfmlalt	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.h\[7\]
+**      ...
+*/
+float32x4_t test_bfmlalt_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c
new file mode 100644
index 0000000000000000000000000000000000000000..b12cf47d67a33f13967738b48a4984765c0ff2df
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c
@@ -0,0 +1,19 @@ 
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include <arm_neon.h>
+
+
+/*
+**test_bfmmla:
+**	...
+**	bfmmla	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+**	...
+*/
+float32x4_t test_bfmmla (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfmmlaq_f32 (r, x, y);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..4a8a9b64c04b39f3cd95101326022f67326921f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c
@@ -0,0 +1,46 @@ 
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+f_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 34655 } */
+  vbfmlaltq_lane_f32 (r, a, b, -1);
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 34655 } */
+  vbfmlaltq_lane_f32 (r, a, b, 4);
+  return;
+}
+
+void
+f_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 34671 } */
+  vbfmlaltq_laneq_f32 (r, a, b, -1);
+  /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 34671 } */
+  vbfmlaltq_laneq_f32 (r, a, b, 8);
+  return;
+}
+
+void
+f_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 34647 } */
+  vbfmlalbq_lane_f32 (r, a, b, -1);
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 34647 } */
+  vbfmlalbq_lane_f32 (r, a, b, 4);
+  return;
+}
+
+void
+f_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 34663 } */
+  vbfmlalbq_laneq_f32 (r, a, b, -1);
+  /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 34663 } */
+  vbfmlalbq_laneq_f32 (r, a, b, 8);
+  return;
+}