Patchwork [AArch64_BE,4/4] Big-Endian lane numbering fix

Submitter: Alex Velenko
Date: Jan. 16, 2014, 11:52 a.m.
Message ID: <52D7C802.9050901@arm.com>
Permalink: /patch/311718/
State: New

Comments

Alex Velenko - Jan. 16, 2014, 11:52 a.m.
Hi,
The previous big-endian patches changed the way lane indices are
calculated. To accommodate that change, the ARM NEON intrinsics had
to be updated.
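
For context, the earlier patches made the compiler's lane numbering
match the architectural lane numbering on both endiannesses, which
makes the compensating __LANE0 macro in arm_neon.h redundant. The
macro removed by this patch (taken from the diff below) was:

  #if __AARCH64EB__
  #define __LANE0(__t) ((__t) - 1)  /* big-endian: highest-numbered lane */
  #else
  #define __LANE0(__t) 0            /* little-endian: plain lane 0 */
  #endif

With the lane-numbering fix in place, every reduction intrinsic can
read its result from literal lane 0, e.g. vaddv_s8 now uses
vget_lane_s8 (..., 0) rather than vget_lane_s8 (..., __LANE0 (8)).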

Is it okay?

/gcc/

2014-01-16  James Greenhalgh  <james.greenhalgh@arm.com>
	    Alex Velenko  <Alex.Velenko@arm.com>

	* config/aarch64/arm_neon.h (vaddv_s8): __LANE0 cleanup.
	(vaddv_s16): Likewise.
	(vaddv_s32): Likewise.
	(vaddv_u8): Likewise.
	(vaddv_u16): Likewise.
	(vaddv_u32): Likewise.
	(vaddvq_s8): Likewise.
	(vaddvq_s16): Likewise.
	(vaddvq_s32): Likewise.
	(vaddvq_s64): Likewise.
	(vaddvq_u8): Likewise.
	(vaddvq_u16): Likewise.
	(vaddvq_u32): Likewise.
	(vaddvq_u64): Likewise.
	(vaddv_f32): Likewise.
	(vaddvq_f32): Likewise.
	(vaddvq_f64): Likewise.
	(vmaxv_f32): Likewise.
	(vmaxv_s8): Likewise.
	(vmaxv_s16): Likewise.
	(vmaxv_s32): Likewise.
	(vmaxv_u8): Likewise.
	(vmaxv_u16): Likewise.
	(vmaxv_u32): Likewise.
	(vmaxvq_f32): Likewise.
	(vmaxvq_f64): Likewise.
	(vmaxvq_s8): Likewise.
	(vmaxvq_s16): Likewise.
	(vmaxvq_s32): Likewise.
	(vmaxvq_u8): Likewise.
	(vmaxvq_u16): Likewise.
	(vmaxvq_u32): Likewise.
	(vmaxnmv_f32): Likewise.
	(vmaxnmvq_f32): Likewise.
	(vmaxnmvq_f64): Likewise.
	(vminv_f32): Likewise.
	(vminv_s8): Likewise.
	(vminv_s16): Likewise.
	(vminv_s32): Likewise.
	(vminv_u8): Likewise.
	(vminv_u16): Likewise.
	(vminv_u32): Likewise.
	(vminvq_f32): Likewise.
	(vminvq_f64): Likewise.
	(vminvq_s8): Likewise.
	(vminvq_s16): Likewise.
	(vminvq_s32): Likewise.
	(vminvq_u8): Likewise.
	(vminvq_u16): Likewise.
	(vminvq_u32): Likewise.
	(vminnmv_f32): Likewise.
	(vminnmvq_f32): Likewise.
	(vminnmvq_f64): Likewise.
Marcus Shawcroft - Jan. 21, 2014, 1:34 p.m.
2014/1/16 Alex Velenko <Alex.Velenko@arm.com>:
> Hi,
> The previous big-endian patches changed the way lane indices are
> calculated. To accommodate that change, the ARM NEON intrinsics had
> to be updated.
>
> Is it okay?

OK /Marcus
Alex Velenko - Jan. 21, 2014, 7:01 p.m.
Hi,
Could someone please commit this patch? I do not have the privileges
to do so.
Kind regards,
Alex Velenko

On 21/01/14 13:34, Marcus Shawcroft wrote:
> 2014/1/16 Alex Velenko <Alex.Velenko@arm.com>:
>> Hi,
>> The previous big-endian patches changed the way lane indices are
>> calculated. To accommodate that change, the ARM NEON intrinsics had
>> to be updated.
>>
>> Is it okay?
>
> OK /Marcus
>
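
As a minimal illustration (a hypothetical caller, not part of the
patch), the intrinsics keep their documented behaviour; only the
internal lane index used to fetch the reduction result changes:

  #include <arm_neon.h>

  /* Sum the two lanes of a 32-bit vector.  After this patch the
     result of the underlying reduction is read from lane 0 on both
     little- and big-endian targets.  */
  int32_t
  sum_pair (int32x2_t __a)
  {
    return vaddv_s32 (__a);
  }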

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 33816d4381c8cf271fc4a85db6cc668f6c031dd8..568ade61653d213da5c1826c970ee350e1fdee97 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -15307,30 +15307,24 @@  vaddd_u64 (uint64x1_t __a, uint64x1_t __b)
   return __a + __b;
 }
 
-#if __AARCH64EB__
-#define __LANE0(__t) ((__t) - 1)
-#else
-#define __LANE0(__t) 0
-#endif
-
 /* vaddv */
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), __LANE0 (8));
+  return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -15338,7 +15332,7 @@  vaddv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
 		__builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -15346,7 +15340,7 @@  vaddv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
 		__builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -15354,32 +15348,32 @@  vaddv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
 		__builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a),
-		__LANE0 (2));
+		0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddvq_s8 (int8x16_t __a)
 {
   return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a),
-			__LANE0 (16));
+			0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0);
 }
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
 vaddvq_s64 (int64x2_t __a)
 {
-  return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), __LANE0 (2));
+  return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -15387,7 +15381,7 @@  vaddvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
 		__builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a),
-		__LANE0 (16));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -15395,7 +15389,7 @@  vaddvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
 		__builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -15403,7 +15397,7 @@  vaddvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
 		__builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 __extension__ static __inline uint64_t __attribute__ ((__always_inline__))
@@ -15411,28 +15405,28 @@  vaddvq_u64 (uint64x2_t __a)
 {
   return vgetq_lane_u64 ((uint64x2_t)
 		__builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a),
-		__LANE0 (2));
+		0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddv_f32 (float32x2_t __a)
 {
   float32x2_t __t = __builtin_aarch64_reduc_splus_v2sf (__a);
-  return vget_lane_f32 (__t, __LANE0 (2));
+  return vget_lane_f32 (__t, 0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddvq_f32 (float32x4_t __a)
 {
   float32x4_t __t = __builtin_aarch64_reduc_splus_v4sf (__a);
-  return vgetq_lane_f32 (__t, __LANE0 (4));
+  return vgetq_lane_f32 (__t, 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vaddvq_f64 (float64x2_t __a)
 {
   float64x2_t __t = __builtin_aarch64_reduc_splus_v2df (__a);
-  return vgetq_lane_f64 (__t, __LANE0 (2));
+  return vgetq_lane_f64 (__t, 0);
 }
 
 /* vbsl  */
@@ -19814,25 +19808,25 @@  __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smax_nan_v2sf (__a),
-			__LANE0 (2));
+			0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), __LANE0 (8));
+  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -19840,7 +19834,7 @@  vmaxv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
 		__builtin_aarch64_reduc_umax_v8qi ((int8x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -19848,7 +19842,7 @@  vmaxv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
 		__builtin_aarch64_reduc_umax_v4hi ((int16x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -19856,39 +19850,39 @@  vmaxv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
 		__builtin_aarch64_reduc_umax_v2si ((int32x2_t) __a),
-		__LANE0 (2));
+		0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxvq_f32 (float32x4_t __a)
 {
   return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_nan_v4sf (__a),
-			 __LANE0 (4));
+			 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxvq_f64 (float64x2_t __a)
 {
   return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_nan_v2df (__a),
-			 __LANE0 (2));
+			 0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), __LANE0 (16));
+  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -19896,7 +19890,7 @@  vmaxvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
 		__builtin_aarch64_reduc_umax_v16qi ((int8x16_t) __a),
-		__LANE0 (16));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -19904,7 +19898,7 @@  vmaxvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
 		__builtin_aarch64_reduc_umax_v8hi ((int16x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -19912,7 +19906,7 @@  vmaxvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
 		__builtin_aarch64_reduc_umax_v4si ((int32x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 /* vmaxnmv  */
@@ -19921,19 +19915,19 @@  __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a),
-			__LANE0 (2));
+			0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), __LANE0 (4));
+  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), __LANE0 (2));
+  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0);
 }
 
 /* vmin  */
@@ -20060,26 +20054,26 @@  __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smin_nan_v2sf (__a),
-			__LANE0 (2));
+			0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminv_s8 (int8x8_t __a)
 {
   return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a),
-		       __LANE0 (8));
+		       0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -20087,7 +20081,7 @@  vminv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
 		__builtin_aarch64_reduc_umin_v8qi ((int8x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -20095,7 +20089,7 @@  vminv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
 		__builtin_aarch64_reduc_umin_v4hi ((int16x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -20103,39 +20097,39 @@  vminv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
 		__builtin_aarch64_reduc_umin_v2si ((int32x2_t) __a),
-		__LANE0 (2));
+		0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminvq_f32 (float32x4_t __a)
 {
   return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_nan_v4sf (__a),
-			 __LANE0 (4));
+			 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminvq_f64 (float64x2_t __a)
 {
   return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_nan_v2df (__a),
-			 __LANE0 (2));
+			 0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), __LANE0 (16));
+  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -20143,7 +20137,7 @@  vminvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
 		__builtin_aarch64_reduc_umin_v16qi ((int8x16_t) __a),
-		__LANE0 (16));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -20151,7 +20145,7 @@  vminvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
 		__builtin_aarch64_reduc_umin_v8hi ((int16x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -20159,7 +20153,7 @@  vminvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
 		__builtin_aarch64_reduc_umin_v4si ((int32x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 /* vminnmv  */
@@ -20167,19 +20161,19 @@  vminvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), __LANE0 (2));
+  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), __LANE0 (4));
+  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), __LANE0 (2));
+  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0);
 }
 
 /* vmla */
@@ -25218,8 +25212,6 @@  __INTERLEAVE_LIST (zip)
 
 /* End of optimal implementations in approved order.  */
 
-#undef __LANE0
-
 #undef __aarch64_vget_lane_any
 #undef __aarch64_vget_lane_f32
 #undef __aarch64_vget_lane_f64