
[AArch64_BE,4/4] Big-Endian lane numbering fix

Message ID 52D7C802.9050901@arm.com
State New

Commit Message

Alex Velenko Jan. 16, 2014, 11:52 a.m. UTC
Hi,
In the previous big-endian patches of this series, the way lane indices
are calculated was changed. To accommodate that change, the AArch64
NEON intrinsics in arm_neon.h had to be updated.
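
For reference, the helper this patch removes expanded to the highest
lane index on big-endian and to 0 otherwise, so that "lane 0" named the
lane holding the scalar result of an across-lanes reduction:

  #if __AARCH64EB__
  #define __LANE0(__t) ((__t) - 1)   /* e.g. __LANE0 (4) == 3 */
  #else
  #define __LANE0(__t) 0
  #endif

With the new lane numbering, the result is read from lane 0 on both
endiannesses, which is what the hunks below switch to. A small
standalone check (illustrative only, not part of the patch; the test
values are invented, and it assumes an AArch64 compiler providing
arm_neon.h):

  #include <arm_neon.h>

  int
  main (void)
  {
    int16x4_t a = {1, 2, 3, 4};
    uint8x8_t b = {1, 1, 1, 1, 1, 1, 1, 9};

    /* Across-lanes reductions place their scalar result in lane 0
       regardless of endianness, so the intrinsics now extract lane 0
       directly instead of going through __LANE0.  */
    if (vaddv_s16 (a) != 10)   /* 1 + 2 + 3 + 4 */
      return 1;
    if (vmaxv_u8 (b) != 9)     /* maximum across the eight lanes */
      return 1;
    return 0;
  }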

Is this patch OK?

gcc/

2014-01-16  James Greenhalgh  <james.greenhalgh@arm.com>
	    Alex Velenko  <Alex.Velenko@arm.com>

	* config/aarch64/arm_neon.h (vaddv_s8): __LANE0 cleanup.
	(vaddv_s16): Likewise.
	(vaddv_s32): Likewise.
	(vaddv_u8): Likewise.
	(vaddv_u16): Likewise.
	(vaddv_u32): Likewise.
	(vaddvq_s8): Likewise.
	(vaddvq_s16): Likewise.
	(vaddvq_s32): Likewise.
	(vaddvq_s64): Likewise.
	(vaddvq_u8): Likewise.
	(vaddvq_u16): Likewise.
	(vaddvq_u32): Likewise.
	(vaddvq_u64): Likewise.
	(vaddv_f32): Likewise.
	(vaddvq_f32): Likewise.
	(vaddvq_f64): Likewise.
	(vmaxv_f32): Likewise.
	(vmaxv_s8): Likewise.
	(vmaxv_s16): Likewise.
	(vmaxv_s32): Likewise.
	(vmaxv_u8): Likewise.
	(vmaxv_u16): Likewise.
	(vmaxv_u32): Likewise.
	(vmaxvq_f32): Likewise.
	(vmaxvq_f64): Likewise.
	(vmaxvq_s8): Likewise.
	(vmaxvq_s16): Likewise.
	(vmaxvq_s32): Likewise.
	(vmaxvq_u8): Likewise.
	(vmaxvq_u16): Likewise.
	(vmaxvq_u32): Likewise.
	(vmaxnmv_f32): Likewise.
	(vmaxnmvq_f32): Likewise.
	(vmaxnmvq_f64): Likewise.
	(vminv_f32): Likewise.
	(vminv_s8): Likewise.
	(vminv_s16): Likewise.
	(vminv_s32): Likewise.
	(vminv_u8): Likewise.
	(vminv_u16): Likewise.
	(vminv_u32): Likewise.
	(vminvq_f32): Likewise.
	(vminvq_f64): Likewise.
	(vminvq_s8): Likewise.
	(vminvq_s16): Likewise.
	(vminvq_s32): Likewise.
	(vminvq_u8): Likewise.
	(vminvq_u16): Likewise.
	(vminvq_u32): Likewise.
	(vminnmv_f32): Likewise.
	(vminnmvq_f32): Likewise.
	(vminnmvq_f64): Likewise.

Comments

Marcus Shawcroft Jan. 21, 2014, 1:34 p.m. UTC | #1
2014/1/16 Alex Velenko <Alex.Velenko@arm.com>:
> Hi,
> In the previous big-endian patches of this series, the way lane indices
> are calculated was changed. To accommodate that change, the AArch64
> NEON intrinsics in arm_neon.h had to be updated.
>
> Is this patch OK?

OK /Marcus
Alex Velenko Jan. 21, 2014, 7:01 p.m. UTC | #2
Hi,
Could someone please commit this patch? I do not have the privileges
to do so.
Kind regards,
Alex Velenko

On 21/01/14 13:34, Marcus Shawcroft wrote:
> 2014/1/16 Alex Velenko <Alex.Velenko@arm.com>:
>> Hi,
>> In the previous big-endian patches of this series, the way lane indices
>> are calculated was changed. To accommodate that change, the AArch64
>> NEON intrinsics in arm_neon.h had to be updated.
>>
>> Is this patch OK?
>
> OK /Marcus
>

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 33816d4381c8cf271fc4a85db6cc668f6c031dd8..568ade61653d213da5c1826c970ee350e1fdee97 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -15307,30 +15307,24 @@  vaddd_u64 (uint64x1_t __a, uint64x1_t __b)
   return __a + __b;
 }
 
-#if __AARCH64EB__
-#define __LANE0(__t) ((__t) - 1)
-#else
-#define __LANE0(__t) 0
-#endif
-
 /* vaddv */
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), __LANE0 (8));
+  return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -15338,7 +15332,7 @@  vaddv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
 		__builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -15346,7 +15340,7 @@  vaddv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
 		__builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -15354,32 +15348,32 @@  vaddv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
 		__builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a),
-		__LANE0 (2));
+		0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddvq_s8 (int8x16_t __a)
 {
   return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a),
-			__LANE0 (16));
+			0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0);
 }
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
 vaddvq_s64 (int64x2_t __a)
 {
-  return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), __LANE0 (2));
+  return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -15387,7 +15381,7 @@  vaddvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
 		__builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a),
-		__LANE0 (16));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -15395,7 +15389,7 @@  vaddvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
 		__builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -15403,7 +15397,7 @@  vaddvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
 		__builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 __extension__ static __inline uint64_t __attribute__ ((__always_inline__))
@@ -15411,28 +15405,28 @@  vaddvq_u64 (uint64x2_t __a)
 {
   return vgetq_lane_u64 ((uint64x2_t)
 		__builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a),
-		__LANE0 (2));
+		0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddv_f32 (float32x2_t __a)
 {
   float32x2_t __t = __builtin_aarch64_reduc_splus_v2sf (__a);
-  return vget_lane_f32 (__t, __LANE0 (2));
+  return vget_lane_f32 (__t, 0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddvq_f32 (float32x4_t __a)
 {
   float32x4_t __t = __builtin_aarch64_reduc_splus_v4sf (__a);
-  return vgetq_lane_f32 (__t, __LANE0 (4));
+  return vgetq_lane_f32 (__t, 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vaddvq_f64 (float64x2_t __a)
 {
   float64x2_t __t = __builtin_aarch64_reduc_splus_v2df (__a);
-  return vgetq_lane_f64 (__t, __LANE0 (2));
+  return vgetq_lane_f64 (__t, 0);
 }
 
 /* vbsl  */
@@ -19814,25 +19808,25 @@  __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smax_nan_v2sf (__a),
-			__LANE0 (2));
+			0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), __LANE0 (8));
+  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -19840,7 +19834,7 @@  vmaxv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
 		__builtin_aarch64_reduc_umax_v8qi ((int8x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -19848,7 +19842,7 @@  vmaxv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
 		__builtin_aarch64_reduc_umax_v4hi ((int16x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -19856,39 +19850,39 @@  vmaxv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
 		__builtin_aarch64_reduc_umax_v2si ((int32x2_t) __a),
-		__LANE0 (2));
+		0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxvq_f32 (float32x4_t __a)
 {
   return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_nan_v4sf (__a),
-			 __LANE0 (4));
+			 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxvq_f64 (float64x2_t __a)
 {
   return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_nan_v2df (__a),
-			 __LANE0 (2));
+			 0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), __LANE0 (16));
+  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -19896,7 +19890,7 @@  vmaxvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
 		__builtin_aarch64_reduc_umax_v16qi ((int8x16_t) __a),
-		__LANE0 (16));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -19904,7 +19898,7 @@  vmaxvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
 		__builtin_aarch64_reduc_umax_v8hi ((int16x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -19912,7 +19906,7 @@  vmaxvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
 		__builtin_aarch64_reduc_umax_v4si ((int32x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 /* vmaxnmv  */
@@ -19921,19 +19915,19 @@  __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a),
-			__LANE0 (2));
+			0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), __LANE0 (4));
+  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), __LANE0 (2));
+  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0);
 }
 
 /* vmin  */
@@ -20060,26 +20054,26 @@  __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smin_nan_v2sf (__a),
-			__LANE0 (2));
+			0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminv_s8 (int8x8_t __a)
 {
   return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a),
-		       __LANE0 (8));
+		       0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -20087,7 +20081,7 @@  vminv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
 		__builtin_aarch64_reduc_umin_v8qi ((int8x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -20095,7 +20089,7 @@  vminv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
 		__builtin_aarch64_reduc_umin_v4hi ((int16x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -20103,39 +20097,39 @@  vminv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
 		__builtin_aarch64_reduc_umin_v2si ((int32x2_t) __a),
-		__LANE0 (2));
+		0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminvq_f32 (float32x4_t __a)
 {
   return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_nan_v4sf (__a),
-			 __LANE0 (4));
+			 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminvq_f64 (float64x2_t __a)
 {
   return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_nan_v2df (__a),
-			 __LANE0 (2));
+			 0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), __LANE0 (16));
+  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -20143,7 +20137,7 @@  vminvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
 		__builtin_aarch64_reduc_umin_v16qi ((int8x16_t) __a),
-		__LANE0 (16));
+		0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -20151,7 +20145,7 @@  vminvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
 		__builtin_aarch64_reduc_umin_v8hi ((int16x8_t) __a),
-		__LANE0 (8));
+		0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -20159,7 +20153,7 @@  vminvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
 		__builtin_aarch64_reduc_umin_v4si ((int32x4_t) __a),
-		__LANE0 (4));
+		0);
 }
 
 /* vminnmv  */
@@ -20167,19 +20161,19 @@  vminvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), __LANE0 (2));
+  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), __LANE0 (4));
+  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), __LANE0 (2));
+  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0);
 }
 
 /* vmla */
@@ -25218,8 +25212,6 @@  __INTERLEAVE_LIST (zip)
 
 /* End of optimal implementations in approved order.  */
 
-#undef __LANE0
-
 #undef __aarch64_vget_lane_any
 #undef __aarch64_vget_lane_f32
 #undef __aarch64_vget_lane_f64