
aarch64: Use RTL builtins for integer mla intrinsics

Message ID DBBPR08MB4758B47688A80ABAB10F6213EBA00@DBBPR08MB4758.eurprd08.prod.outlook.com

Commit Message

Jonathan Wright Jan. 22, 2021, 2:44 p.m. UTC
Hi,

As subject, this patch rewrites integer mla Neon intrinsics to use
RTL builtins rather than inline assembly code, allowing for better
scheduling and optimization.
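
For illustration (not part of the patch): with the builtin, a call such as
the hypothetical one below is visible to the optimizers as ordinary code
rather than an opaque asm block.

  /* Illustrative only: the function name is made up, but vmla_s8 is one of
     the intrinsics changed by this patch.  It computes acc + x * y,
     element-wise, and now expands via __builtin_aarch64_mlav8qi.  */
  #include <arm_neon.h>

  int8x8_t
  acc_mla (int8x8_t acc, int8x8_t x, int8x8_t y)
  {
    return vmla_s8 (acc, x, y);
  }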

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

If ok, please commit to master (I don't have commit rights).

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-14  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Add mla builtin
	generator macro.
	* config/aarch64/arm_neon.h (vmla_s8): Use RTL builtin rather
	than asm.
	(vmla_s16): Likewise.
	(vmla_s32): Likewise.
	(vmla_u8): Likewise.
	(vmla_u16): Likewise.
	(vmla_u32): Likewise.
	(vmlaq_s8): Likewise.
	(vmlaq_s16): Likewise.
	(vmlaq_s32): Likewise.
	(vmlaq_u8): Likewise.
	(vmlaq_u16): Likewise.
	(vmlaq_u32): Likewise.

Comments

Richard Sandiford Jan. 22, 2021, 2:56 p.m. UTC | #1
Thanks for doing this.  The patch looks good with one very minor nit fixed:

Jonathan Wright <Jonathan.Wright@arm.com> writes:
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index f7efee61de4c5268acf446555af4a93fece6b169..da696d9fee2ffbabc9d89f2e9299fbde086cfee1 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -7294,72 +7294,48 @@ __extension__ extern __inline int8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
>  {
> -  int8x8_t __result;
> -  __asm__ ("mla %0.8b, %2.8b, %3.8b"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav8qi(__a, __b, __c);

GNU style (followed in the header file) is to insert a space between
the function name and the arguments.  Same for the other functions.
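
For example, the first hunk above would become (spacing only, nothing else
changes):

  return __builtin_aarch64_mlav8qi (__a, __b, __c);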

Since other patches like this are on their way, would you mind
going through the process on https://gcc.gnu.org/gitwrite.html
to get commit access?  (I'll sponsor.)

Once you've got access, the patch is OK to commit with the change above.

A nice follow-on would be to lower the mla intrinsics to IFN_FMA.
See aarch64_general_gimple_fold_builtin, which does something similar
for IFN_REDUC_PLUS etc.
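
For reference, a very rough sketch of what such a fold might look like,
modelled on the existing IFN_REDUC_PLUS handling; the case-label macro, the
argument order and whether IFN_FMA can be used for these integer modes are
assumptions here, not something this patch establishes:

  /* Hypothetical sketch only.  vmla (a, b, c) computes a + b * c, so the
     builtin's arguments would map to FMA (b, c, a).  */
  BUILTIN_VDQ_BHSI (TERNOP, mla, 0, NONE)
    new_stmt = gimple_build_call_internal (IFN_FMA, 3,
                                           args[1], args[2], args[0]);
    gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
    break;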

Thanks,
Richard

>  }
>  
>  __extension__ extern __inline int16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
>  {
> -  int16x4_t __result;
> -  __asm__ ("mla %0.4h, %2.4h, %3.4h"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav4hi(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline int32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
>  {
> -  int32x2_t __result;
> -  __asm__ ("mla %0.2s, %2.2s, %3.2s"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav2si(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline uint8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
>  {
> -  uint8x8_t __result;
> -  __asm__ ("mla %0.8b, %2.8b, %3.8b"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint8x8_t) __builtin_aarch64_mlav8qi((int8x8_t) __a,
> +                                               (int8x8_t) __b,
> +                                               (int8x8_t) __c);
>  }
>  
>  __extension__ extern __inline uint16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
>  {
> -  uint16x4_t __result;
> -  __asm__ ("mla %0.4h, %2.4h, %3.4h"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint16x4_t) __builtin_aarch64_mlav4hi((int16x4_t) __a,
> +                                                (int16x4_t) __b,
> +                                                (int16x4_t) __c);
>  }
>  
>  __extension__ extern __inline uint32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
>  {
> -  uint32x2_t __result;
> -  __asm__ ("mla %0.2s, %2.2s, %3.2s"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint32x2_t) __builtin_aarch64_mlav2si((int32x2_t) __a,
> +                                                (int32x2_t) __b,
> +                                                (int32x2_t) __c);
>  }
>  
>  #define vmlal_high_lane_s16(a, b, c, d)                                 \
> @@ -7835,72 +7811,48 @@ __extension__ extern __inline int8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
>  {
> -  int8x16_t __result;
> -  __asm__ ("mla %0.16b, %2.16b, %3.16b"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav16qi(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline int16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
>  {
> -  int16x8_t __result;
> -  __asm__ ("mla %0.8h, %2.8h, %3.8h"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav8hi(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
>  {
> -  int32x4_t __result;
> -  __asm__ ("mla %0.4s, %2.4s, %3.4s"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav4si(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline uint8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
>  {
> -  uint8x16_t __result;
> -  __asm__ ("mla %0.16b, %2.16b, %3.16b"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint8x16_t) __builtin_aarch64_mlav16qi((int8x16_t) __a,
> +                                                 (int8x16_t) __b,
> +                                                 (int8x16_t) __c);
>  }
>  
>  __extension__ extern __inline uint16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
>  {
> -  uint16x8_t __result;
> -  __asm__ ("mla %0.8h, %2.8h, %3.8h"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint16x8_t) __builtin_aarch64_mlav8hi((int16x8_t) __a,
> +                                                (int16x8_t) __b,
> +                                                (int16x8_t) __c);
>  }
>  
>  __extension__ extern __inline uint32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
>  {
> -  uint32x4_t __result;
> -  __asm__ ("mla %0.4s, %2.4s, %3.4s"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint32x4_t) __builtin_aarch64_mlav4si((int32x4_t) __a,
> +                                                (int32x4_t) __b,
> +                                                (int32x4_t) __c);
>  }
>  
>  __extension__ extern __inline float32x2_t
Jonathan Wright Jan. 22, 2021, 3:46 p.m. UTC | #2
> GNU style (followed in the header file) is to insert a space between
> the function name and the arguments.  Same for the other functions.

Ah, yes - will change.

> Since other patches like this are on their way, would you mind
> going through the process on https://gcc.gnu.org/gitwrite.html
> to get commit access?  (I'll sponsor.)

Request submitted.

Thanks,
Jonathan

Patch

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 73a24d59745ab03fbed213b01eb3134d053295e1..d156f50e5df5568e563f9b175b84062b6575e7e5 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -178,6 +178,9 @@ 
   /* Implemented by aarch64_xtn<mode>.  */
   BUILTIN_VQN (UNOP, xtn, 0, NONE)
 
+  /* Implemented by aarch64_mla<mode>.  */
+  BUILTIN_VDQ_BHSI (TERNOP, mla, 0, NONE)
+
   /* Implemented by aarch64_<su>mlsl<mode>.  */
   BUILTIN_VD_BHSI (TERNOP, smlsl, 0, NONE)
   BUILTIN_VD_BHSI (TERNOPU, umlsl, 0, NONE)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index f7efee61de4c5268acf446555af4a93fece6b169..da696d9fee2ffbabc9d89f2e9299fbde086cfee1 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7294,72 +7294,48 @@  __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
 {
-  int8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav8qi(__a, __b, __c);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
 {
-  int16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav4hi(__a, __b, __c);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
 {
-  int32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav2si(__a, __b, __c);
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
 {
-  uint8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return (uint8x8_t) __builtin_aarch64_mlav8qi((int8x8_t) __a,
+                                               (int8x8_t) __b,
+                                               (int8x8_t) __c);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
 {
-  uint16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return (uint16x4_t) __builtin_aarch64_mlav4hi((int16x4_t) __a,
+                                                (int16x4_t) __b,
+                                                (int16x4_t) __c);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
 {
-  uint32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return (uint32x2_t) __builtin_aarch64_mlav2si((int32x2_t) __a,
+                                                (int32x2_t) __b,
+                                                (int32x2_t) __c);
 }
 
 #define vmlal_high_lane_s16(a, b, c, d)                                 \
@@ -7835,72 +7811,48 @@  __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
 {
-  int8x16_t __result;
-  __asm__ ("mla %0.16b, %2.16b, %3.16b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav16qi(__a, __b, __c);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
 {
-  int16x8_t __result;
-  __asm__ ("mla %0.8h, %2.8h, %3.8h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav8hi(__a, __b, __c);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("mla %0.4s, %2.4s, %3.4s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav4si(__a, __b, __c);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
 {
-  uint8x16_t __result;
-  __asm__ ("mla %0.16b, %2.16b, %3.16b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return (uint8x16_t) __builtin_aarch64_mlav16qi((int8x16_t) __a,
+                                                 (int8x16_t) __b,
+                                                 (int8x16_t) __c);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
 {
-  uint16x8_t __result;
-  __asm__ ("mla %0.8h, %2.8h, %3.8h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return (uint16x8_t) __builtin_aarch64_mlav8hi((int16x8_t) __a,
+                                                (int16x8_t) __b,
+                                                (int16x8_t) __c);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("mla %0.4s, %2.4s, %3.4s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return (uint32x4_t) __builtin_aarch64_mlav4si((int32x4_t) __a,
+                                                (int32x4_t) __b,
+                                                (int32x4_t) __c);
 }
 
 __extension__ extern __inline float32x2_t