diff mbox

[AArch64,2/2] Replace temporary inline assembler for vget_high

Message ID 53E9E87B.4080700@arm.com
State New
Headers show

Commit Message

Alan Lawrence Aug. 12, 2014, 10:12 a.m. UTC
This patch replaces the current inline assembler for the vget_high intrinsics in 
arm_neon.h with a sequence of other calls, in a similar fashion to vget_low. 
Unlike the assembler, these are all transparent to the front-end, so should 
enable better optimization through the mid-end.

Tested check-gcc and check-g++ and aarch64-none-elf and aarch64_be-none-elf 
(including new tests in previous patch!).

Comments

Marcus Shawcroft Sept. 2, 2014, 4:34 p.m. UTC | #1
On 12 August 2014 11:12, Alan Lawrence <alan.lawrence@arm.com> wrote:
> This patch replaces the current inline assembler for the vget_high
> intrinsics in arm_neon.h with a sequence of other calls, in a similar
> fashion to vget_low. Unlike the assembler, these are all transparent to the
> front-end, so should enable better optimization through the mid-end.
>
> Tested check-gcc and check-g++ and aarch64-none-elf and aarch64_be-none-elf
> (including new tests in previous patch!).

I think we are still waiting on ChangeLogs for this and the related patch?

/Marcus
Alan Lawrence Sept. 4, 2014, 1:21 p.m. UTC | #2
Ah, right you are. (Curiously I still can't find Richard's previous message in 
Thunderbird!)

for first patch, gcc/testsuite/ChangeLog:

         * gcc.target/aarch64/vget_high_1.c: New test.
         * gcc.target/aarch64/vget_low_1.c: Likewise.

for second patch, gcc/ChangeLog:

         * config/aarch64/arm_neon.h (__GET_HIGH): New macro.
         (vget_high_f32, vget_high_f64, vget_high_p8, vget_high_p16,
         vget_high_s8, vget_high_s16, vget_high_s32, vget_high_s64,
	vget_high_u8, vget_high_u16, vget_high_u32, vget_high_u64):
	Remove temporary __asm__ and reimplement.

--Alan


Marcus Shawcroft wrote:
> On 12 August 2014 11:12, Alan Lawrence <alan.lawrence@arm.com> wrote:
>> This patch replaces the current inline assembler for the vget_high
>> intrinsics in arm_neon.h with a sequence of other calls, in a similar
>> fashion to vget_low. Unlike the assembler, these are all transparent to the
>> front-end, so should enable better optimization through the mid-end.
>>
>> Tested check-gcc and check-g++ and aarch64-none-elf and aarch64_be-none-elf
>> (including new tests in previous patch!).
> 
> I think we are still waiting on ChangeLogs for this and the related patch?
> 
> /Marcus
>
Marcus Shawcroft Sept. 4, 2014, 1:29 p.m. UTC | #3
On 4 September 2014 14:21, Alan Lawrence <alan.lawrence@arm.com> wrote:
> Ah, right you are. (Curiously I still can't find Richard's previous message
> in Thunderbird!)
>
> for first patch, gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vget_high_1.c: New test.
>         * gcc.target/aarch64/vget_low_1.c: Likewise.
>
> for second patch, gcc/ChangeLog:
>
>         * config/aarch64/arm_neon.h (__GET_HIGH): New macro.
>         (vget_high_f32, vget_high_f64, vget_high_p8, vget_high_p16,
>         vget_high_s8, vget_high_s16, vget_high_s32, vget_high_s64,
>         vget_high_u8, vget_high_u16, vget_high_u32, vget_high_u64):
>         Remove temporary __asm__ and reimplement.

OK both.
/Marcus
diff mbox

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 45f545779292bbd3ceb1b9e13c980988e52bc3ec..706e07c8cda5d1513a6ea9817f6054cfb2258d48 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -4322,6 +4322,85 @@  vget_low_u64 (uint64x2_t __a)
 
 #undef __GET_LOW
 
+#define __GET_HIGH(__TYPE)					\
+  uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a);		\
+  uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1));	\
+  return vreinterpret_##__TYPE##_u64 (hi);
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vget_high_f32 (float32x4_t __a)
+{
+  __GET_HIGH (f32);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vget_high_f64 (float64x2_t __a)
+{
+  __GET_HIGH (f64);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vget_high_p8 (poly8x16_t __a)
+{
+  __GET_HIGH (p8);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vget_high_p16 (poly16x8_t __a)
+{
+  __GET_HIGH (p16);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vget_high_s8 (int8x16_t __a)
+{
+  __GET_HIGH (s8);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vget_high_s16 (int16x8_t __a)
+{
+  __GET_HIGH (s16);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vget_high_s32 (int32x4_t __a)
+{
+  __GET_HIGH (s32);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vget_high_s64 (int64x2_t __a)
+{
+  __GET_HIGH (s64);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vget_high_u8 (uint8x16_t __a)
+{
+  __GET_HIGH (u8);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vget_high_u16 (uint16x8_t __a)
+{
+  __GET_HIGH (u16);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vget_high_u32 (uint32x4_t __a)
+{
+  __GET_HIGH (u32);
+}
+
+#undef __GET_HIGH
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vget_high_u64 (uint64x2_t __a)
+{
+  return vcreate_u64 (vgetq_lane_u64 (__a, 1));
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vcombine_s8 (int8x8_t __a, int8x8_t __b)
 {
@@ -5764,138 +5843,6 @@  vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vget_high_f32 (float32x4_t a)
-{
-  float32x2_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vget_high_f64 (float64x2_t a)
-{
-  float64x1_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vget_high_p8 (poly8x16_t a)
-{
-  poly8x8_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vget_high_p16 (poly16x8_t a)
-{
-  poly16x4_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vget_high_s8 (int8x16_t a)
-{
-  int8x8_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vget_high_s16 (int16x8_t a)
-{
-  int16x4_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vget_high_s32 (int32x4_t a)
-{
-  int32x2_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vget_high_s64 (int64x2_t a)
-{
-  int64x1_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vget_high_u8 (uint8x16_t a)
-{
-  uint8x8_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vget_high_u16 (uint16x8_t a)
-{
-  uint16x4_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vget_high_u32 (uint32x4_t a)
-{
-  uint32x2_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vget_high_u64 (uint64x2_t a)
-{
-  uint64x1_t result;
-  __asm__ ("ins %0.d[0], %1.d[1]"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vhsub_s8 (int8x8_t a, int8x8_t b)
 {