From patchwork Thu Jan 16 11:52:34 2014
X-Patchwork-Submitter: Alex Velenko
X-Patchwork-Id: 311718
Message-ID: <52D7C802.9050901@arm.com>
Date: Thu, 16 Jan 2014 11:52:34 +0000
From: Alex Velenko
To: "gcc-patches@gcc.gnu.org"
CC: Marcus Shawcroft
Subject: [PATCH AArch64_BE 4/4] Big-Endian lane numbering fix

Hi,

The previous patches in this big-endian series changed the way lane
indices are calculated.  To accommodate that change, the AArch64 NEON
intrinsics in arm_neon.h have to be updated: the __LANE0 workaround
macro is no longer needed, so the reduction intrinsics now extract
lane 0 directly.  (A small standalone example illustrating the effect
follows the patch.)

Is this patch OK?

/gcc/
2014-01-16  James Greenhalgh
            Alex Velenko

        * config/aarch64/arm_neon.h (vaddv_s8): __LANE0 cleanup.
        (vaddv_s16): Likewise.
        (vaddv_s32): Likewise.
        (vaddv_u8): Likewise.
        (vaddv_u16): Likewise.
        (vaddv_u32): Likewise.
        (vaddvq_s8): Likewise.
        (vaddvq_s16): Likewise.
        (vaddvq_s32): Likewise.
        (vaddvq_s64): Likewise.
        (vaddvq_u8): Likewise.
        (vaddvq_u16): Likewise.
        (vaddvq_u32): Likewise.
        (vaddvq_u64): Likewise.
        (vaddv_f32): Likewise.
        (vaddvq_f32): Likewise.
        (vaddvq_f64): Likewise.
        (vmaxv_f32): Likewise.
        (vmaxv_s8): Likewise.
        (vmaxv_s16): Likewise.
        (vmaxv_s32): Likewise.
        (vmaxv_u8): Likewise.
        (vmaxv_u16): Likewise.
        (vmaxv_u32): Likewise.
        (vmaxvq_f32): Likewise.
        (vmaxvq_f64): Likewise.
        (vmaxvq_s8): Likewise.
        (vmaxvq_s16): Likewise.
        (vmaxvq_s32): Likewise.
        (vmaxvq_u8): Likewise.
        (vmaxvq_u16): Likewise.
        (vmaxvq_u32): Likewise.
        (vmaxnmv_f32): Likewise.
        (vmaxnmvq_f32): Likewise.
        (vmaxnmvq_f64): Likewise.
        (vminv_f32): Likewise.
        (vminv_s8): Likewise.
        (vminv_s16): Likewise.
        (vminv_s32): Likewise.
        (vminv_u8): Likewise.
        (vminv_u16): Likewise.
        (vminv_u32): Likewise.
        (vminvq_f32): Likewise.
        (vminvq_f64): Likewise.
        (vminvq_s8): Likewise.
        (vminvq_s16): Likewise.
        (vminvq_s32): Likewise.
        (vminvq_u8): Likewise.
        (vminvq_u16): Likewise.
        (vminvq_u32): Likewise.
        (vminnmv_f32): Likewise.
        (vminnmvq_f32): Likewise.
        (vminnmvq_f64): Likewise.

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 33816d4381c8cf271fc4a85db6cc668f6c031dd8..568ade61653d213da5c1826c970ee350e1fdee97 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -15307,30 +15307,24 @@ vaddd_u64 (uint64x1_t __a, uint64x1_t __b)
   return __a + __b;
 }
 
-#if __AARCH64EB__
-#define __LANE0(__t) ((__t) - 1)
-#else
-#define __LANE0(__t) 0
-#endif
-
 /* vaddv */
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), __LANE0 (8));
+  return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -15338,7 +15332,7 @@ vaddv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
         __builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a),
-        __LANE0 (8));
+        0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -15346,7 +15340,7 @@ vaddv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
         __builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a),
-        __LANE0 (4));
+        0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -15354,32 +15348,32 @@ vaddv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
         __builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddvq_s8 (int8x16_t __a)
 {
   return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a),
-        __LANE0 (16));
+        0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0);
 }
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
 vaddvq_s64 (int64x2_t __a)
 {
-  return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), __LANE0 (2));
+  return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -15387,7 +15381,7 @@ vaddvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
         __builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a),
-        __LANE0 (16));
+        0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -15395,7 +15389,7 @@ vaddvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
         __builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a),
-        __LANE0 (8));
+        0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -15403,7 +15397,7 @@ vaddvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
         __builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a),
-        __LANE0 (4));
+        0);
 }
 
 __extension__ static __inline uint64_t __attribute__ ((__always_inline__))
@@ -15411,28 +15405,28 @@ vaddvq_u64 (uint64x2_t __a)
 {
   return vgetq_lane_u64 ((uint64x2_t)
         __builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddv_f32 (float32x2_t __a)
 {
   float32x2_t __t = __builtin_aarch64_reduc_splus_v2sf (__a);
-  return vget_lane_f32 (__t, __LANE0 (2));
+  return vget_lane_f32 (__t, 0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddvq_f32 (float32x4_t __a)
 {
   float32x4_t __t = __builtin_aarch64_reduc_splus_v4sf (__a);
-  return vgetq_lane_f32 (__t, __LANE0 (4));
+  return vgetq_lane_f32 (__t, 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vaddvq_f64 (float64x2_t __a)
 {
   float64x2_t __t = __builtin_aarch64_reduc_splus_v2df (__a);
-  return vgetq_lane_f64 (__t, __LANE0 (2));
+  return vgetq_lane_f64 (__t, 0);
 }
 
 /* vbsl */
@@ -19814,25 +19808,25 @@ __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smax_nan_v2sf (__a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), __LANE0 (8));
+  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -19840,7 +19834,7 @@ vmaxv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
         __builtin_aarch64_reduc_umax_v8qi ((int8x8_t) __a),
-        __LANE0 (8));
+        0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -19848,7 +19842,7 @@ vmaxv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
         __builtin_aarch64_reduc_umax_v4hi ((int16x4_t) __a),
-        __LANE0 (4));
+        0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -19856,39 +19850,39 @@ vmaxv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
         __builtin_aarch64_reduc_umax_v2si ((int32x2_t) __a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxvq_f32 (float32x4_t __a)
 {
   return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_nan_v4sf (__a),
-        __LANE0 (4));
+        0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxvq_f64 (float64x2_t __a)
 {
   return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_nan_v2df (__a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), __LANE0 (16));
+  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -19896,7 +19890,7 @@ vmaxvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
         __builtin_aarch64_reduc_umax_v16qi ((int8x16_t) __a),
-        __LANE0 (16));
+        0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -19904,7 +19898,7 @@ vmaxvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
         __builtin_aarch64_reduc_umax_v8hi ((int16x8_t) __a),
-        __LANE0 (8));
+        0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -19912,7 +19906,7 @@ vmaxvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
         __builtin_aarch64_reduc_umax_v4si ((int32x4_t) __a),
-        __LANE0 (4));
+        0);
 }
 
 /* vmaxnmv */
@@ -19921,19 +19915,19 @@ __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), __LANE0 (4));
+  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), __LANE0 (2));
+  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0);
 }
 
 /* vmin */
@@ -20060,26 +20054,26 @@ __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminv_f32 (float32x2_t __a)
 {
   return vget_lane_f32 (__builtin_aarch64_reduc_smin_nan_v2sf (__a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminv_s8 (int8x8_t __a)
 {
   return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a),
-        __LANE0 (8));
+        0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), __LANE0 (4));
+  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), __LANE0 (2));
+  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -20087,7 +20081,7 @@ vminv_u8 (uint8x8_t __a)
 {
   return vget_lane_u8 ((uint8x8_t)
         __builtin_aarch64_reduc_umin_v8qi ((int8x8_t) __a),
-        __LANE0 (8));
+        0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -20095,7 +20089,7 @@ vminv_u16 (uint16x4_t __a)
 {
   return vget_lane_u16 ((uint16x4_t)
         __builtin_aarch64_reduc_umin_v4hi ((int16x4_t) __a),
-        __LANE0 (4));
+        0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -20103,39 +20097,39 @@ vminv_u32 (uint32x2_t __a)
 {
   return vget_lane_u32 ((uint32x2_t)
         __builtin_aarch64_reduc_umin_v2si ((int32x2_t) __a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminvq_f32 (float32x4_t __a)
 {
   return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_nan_v4sf (__a),
-        __LANE0 (4));
+        0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminvq_f64 (float64x2_t __a)
 {
   return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_nan_v2df (__a),
-        __LANE0 (2));
+        0);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), __LANE0 (16));
+  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), __LANE0 (8));
+  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), __LANE0 (4));
+  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -20143,7 +20137,7 @@ vminvq_u8 (uint8x16_t __a)
 {
   return vgetq_lane_u8 ((uint8x16_t)
         __builtin_aarch64_reduc_umin_v16qi ((int8x16_t) __a),
-        __LANE0 (16));
+        0);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
@@ -20151,7 +20145,7 @@ vminvq_u16 (uint16x8_t __a)
 {
   return vgetq_lane_u16 ((uint16x8_t)
         __builtin_aarch64_reduc_umin_v8hi ((int16x8_t) __a),
-        __LANE0 (8));
+        0);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
@@ -20159,7 +20153,7 @@ vminvq_u32 (uint32x4_t __a)
 {
   return vgetq_lane_u32 ((uint32x4_t)
         __builtin_aarch64_reduc_umin_v4si ((int32x4_t) __a),
-        __LANE0 (4));
+        0);
 }
 
 /* vminnmv */
@@ -20167,19 +20161,19 @@ vminvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), __LANE0 (2));
+  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), __LANE0 (4));
+  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), __LANE0 (2));
+  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0);
 }
 
 /* vmla */
@@ -25218,8 +25212,6 @@ __INTERLEAVE_LIST (zip)
 
 /* End of optimal implementations in approved order.  */
 
-#undef __LANE0
-
 #undef __aarch64_vget_lane_any
 #undef __aarch64_vget_lane_f32
 #undef __aarch64_vget_lane_f64
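For illustration only (not part of the patch): a minimal standalone
sketch of the behaviour this cleanup relies on.  After the earlier
lane-numbering patches in this series, the scalar result of the
reduction builtins always sits in architectural lane 0, so extracting
lane 0 is correct on both little- and big-endian targets.  The file
name and test values below are made up.

/* lane0-example.c: hypothetical sketch, not part of the patch.
   Build with an AArch64 GCC for either endianness, e.g.
   "gcc -o lane0-example lane0-example.c".  */
#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  int32_t data[2] = { 1, 2 };
  int32x2_t v = vld1_s32 (data);

  /* vaddv_s32 now reads lane 0 of the reduction result directly;
     the old __LANE0 endian workaround is no longer involved.  */
  int32_t sum = vaddv_s32 (v);

  printf ("vaddv_s32 = %d\n", sum);  /* Expect 3 on both LE and BE.  */
  return sum == 3 ? 0 : 1;
}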