[AArch64] Upgrade integer MLA intrinsics to GCC vector extensions

Message ID 20200812084027.35513-1-james.greenhalgh@arm.com
State New
Series [AArch64] Upgrade integer MLA intrinsics to GCC vector extensions

Commit Message

James Greenhalgh Aug. 12, 2020, 8:40 a.m. UTC
Hi,

As subject, this patch rewrites the mla intrinsics to use a + b * c rather
than inline assembler, thereby opening them to CSE, scheduling, etc.
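
For example (a hypothetical illustration, not part of the patch), the
optimizers can now fold a zero accumulator away, which the old opaque
asm blocks prevented:

#include <arm_neon.h>

int32x2_t
fold_example (int32x2_t b, int32x2_t c)
{
  /* 0 + b * c simplifies to b * c, so a plain MUL can be emitted here
     rather than a MOVI followed by an MLA.  */
  return vmla_s32 (vdup_n_s32 (0), b, c);
}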

Bootstrapped and tested on aarch64-none-linux-gnu.

OK?

Thanks,
James

---

gcc/ChangeLog:

2020-08-11  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/arm_neon.h (vmla_s8): Upgrade to C rather than asm.
	(vmla_s16): Likewise.
	(vmla_s32): Likewise.
	(vmla_u8): Likewise.
	(vmla_u16): Likewise.
	(vmla_u32): Likewise.
	(vmlaq_s8): Likewise.
	(vmlaq_s16): Likewise.
	(vmlaq_s32): Likewise.
	(vmlaq_u8): Likewise.
	(vmlaq_u16): Likewise.
	(vmlaq_u32): Likewise.

Comments

Christophe Lyon Aug. 12, 2020, 8:54 a.m. UTC | #1
Hi James,

On Wed, 12 Aug 2020 at 10:40, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>
>
> Hi,
>
> As subject, this patch rewrites the mla intrinsics to use a + b * c rather
> than inline assembler, thereby opening them to CSE, scheduling, etc.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>

Do we have tests that make sure we still generate the mla instructions?
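
For reference, such a test would typically be a compile test with a
scan-assembler check under gcc.target/aarch64; a minimal sketch (with a
hypothetical file name, say vmla_intrinsic_1.c) might look like:

/* { dg-do compile } */
/* { dg-options "-O2" } */

#include <arm_neon.h>

int32x4_t
test_vmlaq_s32 (int32x4_t acc, int32x4_t b, int32x4_t c)
{
  return vmlaq_s32 (acc, b, c);
}

/* { dg-final { scan-assembler "mla\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */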

Richard Sandiford Aug. 12, 2020, 9:03 a.m. UTC | #2
James Greenhalgh <james.greenhalgh@arm.com> writes:
> Hi,
>
> As subject, this patch rewrites the mla intrinsics to use a + b * c rather
> than inline assembler, thereby opening them to CSE, scheduling, etc.

Looks good for the unsigned ones.  For the signed ones, there's a risk
that the functions might become subject to the usual UB for signed
overflow, rather than acting just like the instructions do.  (I realise
that isn't unique to these functions, but it'd be good not to introduce
more instances of it.)

So for the signed ones, it might be safer to cast to the unsigned type,
do the operation, and then cast back.
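
Something like this, as a sketch for vmla_s8 (the same pattern would
apply to the other signed variants):

__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
{
  /* Do the arithmetic in the unsigned type, where wrap-around is well
     defined, then cast the result back to the signed vector type.  */
  return (int8x8_t) ((uint8x8_t) __a + (uint8x8_t) __b * (uint8x8_t) __c);
}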

Thanks,
Richard


Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 50f8b23bc17..aa548e4e6c7 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7400,72 +7400,42 @@  __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
 {
-  int8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
 {
-  int16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
 {
-  int32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
 {
-  uint8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
 {
-  uint16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
 {
-  uint32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 #define vmlal_high_lane_s16(a, b, c, d)                                 \
@@ -7941,72 +7911,42 @@  __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
 {
-  int8x16_t __result;
-  __asm__ ("mla %0.16b, %2.16b, %3.16b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
 {
-  int16x8_t __result;
-  __asm__ ("mla %0.8h, %2.8h, %3.8h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("mla %0.4s, %2.4s, %3.4s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
 {
-  uint8x16_t __result;
-  __asm__ ("mla %0.16b, %2.16b, %3.16b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
 {
-  uint16x8_t __result;
-  __asm__ ("mla %0.8h, %2.8h, %3.8h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("mla %0.4s, %2.4s, %3.4s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline float32x2_t