From patchwork Wed May 1 11:01:57 2013 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: James Greenhalgh X-Patchwork-Id: 240762 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (Client CN "localhost", Issuer "www.qmailtoaster.com" (not verified)) by ozlabs.org (Postfix) with ESMTPS id 75B242C00C4 for ; Wed, 1 May 2013 21:02:30 +1000 (EST) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:from :to:cc:subject:date:message-id:mime-version:content-type; q=dns; s=default; b=wo5CV23bGzYnpsSSCD8KdDFWJKSdNe04PK6YC80ZxAhbz+g0jU +ly3Us/Gu88oz0gsbx3BnEFtv+nMex19AR+TIDDGTExdjIba+5MWN7+CrjeLdDP1 Mi+S3v4VF1lulYz9j5mQFZ3U4UJgxCn553eZ3AVce08YvGW4BDVul11Fg= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:from :to:cc:subject:date:message-id:mime-version:content-type; s= default; bh=f0+0g0/wGNuQZ4H1IizPWZzlzE8=; b=aDeDEfDLXnM+JEaSPP7C VxVg0mr9BneN4AT8XQ5fuGBXsPqlM7gzWHjD/yJ8AAfrlIuiQztJG31kV27a/zs8 0cy+t5jdnsb2RPAR8799PYscIj5BAuwkKcCzOjPa+CJLTJsp7iLToTclsVwYU1nW VnyOok/GcRYFZgZBwASINV8= Received: (qmail 4560 invoked by alias); 1 May 2013 11:02:22 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 4551 invoked by uid 89); 1 May 2013 11:02:22 -0000 X-Spam-SWARE-Status: No, score=-4.5 required=5.0 tests=AWL, BAYES_00, RCVD_IN_DNSWL_LOW, RCVD_IN_HOSTKARMA_W, RCVD_IN_HOSTKARMA_WL, SPF_PASS, TW_MV, TW_NM, TW_NV, TW_XN, TW_XV autolearn=ham version=3.3.1 Received: from 
service87.mimecast.com (HELO service87.mimecast.com) (91.220.42.44) by sourceware.org (qpsmtpd/0.84/v0.84-167-ge50287c) with ESMTP; Wed, 01 May 2013 11:02:19 +0000 Received: from cam-owa1.Emea.Arm.com (fw-tnat.cambridge.arm.com [217.140.96.21]) by service87.mimecast.com; Wed, 01 May 2013 12:02:02 +0100 Received: from e106375-lin.cambridge.arm.com ([10.1.255.212]) by cam-owa1.Emea.Arm.com with Microsoft SMTPSVC(6.0.3790.0); Wed, 1 May 2013 12:02:01 +0100 From: James Greenhalgh To: gcc-patches@gcc.gnu.org Cc: marcus.shawcroft@arm.com Subject: [AArch64] Rewrite v_<8, 16, 32, 64> intrinsics using builtins. Date: Wed, 1 May 2013 12:01:57 +0100 Message-Id: <1367406117-23139-1-git-send-email-james.greenhalgh@arm.com> MIME-Version: 1.0 X-MC-Unique: 113050112020204701 X-Virus-Found: No Hi, This patch takes the v_<8, 16, 32, 64> intrinsics and rewrites them using builtins. We also add a new compile and execute test for the maximum and minimum reductions. Regression tested on aarch64-none-elf with no regressions. Thanks, James --- gcc/ 2013-05-01 James Greenhalgh * config/aarch64/arm_neon.h (v_<8, 16, 32, 64>): Rewrite using builtins. gcc/testsuite/ 2013-05-01 James Greenhalgh * gcc.target/aarch64/vect-vmaxv.c: New. * gcc.target/aarch64/vect-vfmaxv.c: Likewise. 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 30a59a7..cdefa86 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -8027,303 +8027,6 @@ vld1q_u64 (const uint64_t * a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmaxnm_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("fmaxnm %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmaxnmq_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("fmaxnm %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmaxnmq_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("fmaxnm %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vmaxnmvq_f32 (float32x4_t a) -{ - float32_t result; - __asm__ ("fmaxnmv %s0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vmaxv_s8 (int8x8_t a) -{ - int8_t result; - __asm__ ("smaxv %b0,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vmaxv_s16 (int16x4_t a) -{ - int16_t result; - __asm__ ("smaxv %h0,%1.4h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vmaxv_u8 (uint8x8_t a) -{ - uint8_t result; - __asm__ ("umaxv %b0,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16_t 
__attribute__ ((__always_inline__)) -vmaxv_u16 (uint16x4_t a) -{ - uint16_t result; - __asm__ ("umaxv %h0,%1.4h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vmaxvq_f32 (float32x4_t a) -{ - float32_t result; - __asm__ ("fmaxv %s0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vmaxvq_s8 (int8x16_t a) -{ - int8_t result; - __asm__ ("smaxv %b0,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vmaxvq_s16 (int16x8_t a) -{ - int16_t result; - __asm__ ("smaxv %h0,%1.8h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vmaxvq_s32 (int32x4_t a) -{ - int32_t result; - __asm__ ("smaxv %s0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vmaxvq_u8 (uint8x16_t a) -{ - uint8_t result; - __asm__ ("umaxv %b0,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vmaxvq_u16 (uint16x8_t a) -{ - uint16_t result; - __asm__ ("umaxv %h0,%1.8h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vmaxvq_u32 (uint32x4_t a) -{ - uint32_t result; - __asm__ ("umaxv %s0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vminnmvq_f32 (float32x4_t a) -{ - float32_t result; - __asm__ ("fminnmv %s0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return 
result; -} - -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vminv_s8 (int8x8_t a) -{ - int8_t result; - __asm__ ("sminv %b0,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vminv_s16 (int16x4_t a) -{ - int16_t result; - __asm__ ("sminv %h0,%1.4h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vminv_u8 (uint8x8_t a) -{ - uint8_t result; - __asm__ ("uminv %b0,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vminv_u16 (uint16x4_t a) -{ - uint16_t result; - __asm__ ("uminv %h0,%1.4h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vminvq_f32 (float32x4_t a) -{ - float32_t result; - __asm__ ("fminv %s0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vminvq_s8 (int8x16_t a) -{ - int8_t result; - __asm__ ("sminv %b0,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vminvq_s16 (int16x8_t a) -{ - int16_t result; - __asm__ ("sminv %h0,%1.8h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vminvq_s32 (int32x4_t a) -{ - int32_t result; - __asm__ ("sminv %s0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vminvq_u8 (uint8x16_t a) -{ - uint8_t result; - __asm__ ("uminv %b0,%1.16b" - : "=w"(result) - : "w"(a) - : /* 
No clobbers */); - return result; -} - -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vminvq_u16 (uint16x8_t a) -{ - uint16_t result; - __asm__ ("uminv %h0,%1.8h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vminvq_u32 (uint32x4_t a) -{ - uint32_t result; - __asm__ ("uminv %s0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - #define vmla_lane_f32(a, b, c, d) \ __extension__ \ ({ \ @@ -17308,70 +17011,6 @@ vaddv_u32 (uint32x2_t a) return result; } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vmaxnmv_f32 (float32x2_t a) -{ - float32_t result; - __asm__ ("fmaxnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vminnmv_f32 (float32x2_t a) -{ - float32_t result; - __asm__ ("fminnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vmaxnmvq_f64 (float64x2_t a) -{ - float64_t result; - __asm__ ("fmaxnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : ); - return result; -} - -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vmaxv_s32 (int32x2_t a) -{ - int32_t result; - __asm__ ("smaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); - return result; -} - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vmaxv_u32 (uint32x2_t a) -{ - uint32_t result; - __asm__ ("umaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vminnmvq_f64 (float64x2_t a) -{ - float64_t result; - __asm__ ("fminnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : ); - return result; -} - -__extension__ static __inline int32_t __attribute__ 
((__always_inline__)) -vminv_s32 (int32x2_t a) -{ - int32_t result; - __asm__ ("sminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); - return result; -} - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vminv_u32 (uint32x2_t a) -{ - uint32_t result; - __asm__ ("uminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); - return result; -} - __extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) vpaddd_s64 (int64x2_t __a) { @@ -21637,7 +21276,145 @@ vmaxq_u32 (uint32x4_t __a, uint32x4_t __b) (int32x4_t) __b); } -/* vmin */ +/* vmaxnm */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmaxnm_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_smaxv2sf (__a, __b); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmaxnmq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_smaxv4sf (__a, __b); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmaxnmq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_smaxv2df (__a, __b); +} + +/* vmaxv */ + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vmaxv_f32 (float32x2_t __a) +{ + return vget_lane_f32 (__builtin_aarch64_reduc_smax_nan_v2sf (__a), 0); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vmaxv_s8 (int8x8_t __a) +{ + return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vmaxv_s16 (int16x4_t __a) +{ + return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vmaxv_s32 (int32x2_t __a) +{ + return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vmaxv_u8 (uint8x8_t __a) +{ + return 
vget_lane_u8 ((uint8x8_t) + __builtin_aarch64_reduc_umax_v8qi ((int8x8_t) __a), 0); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vmaxv_u16 (uint16x4_t __a) +{ + return vget_lane_u16 ((uint16x4_t) + __builtin_aarch64_reduc_umax_v4hi ((int16x4_t) __a), 0); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vmaxv_u32 (uint32x2_t __a) +{ + return vget_lane_u32 ((uint32x2_t) + __builtin_aarch64_reduc_umax_v2si ((int32x2_t) __a), 0); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vmaxvq_f32 (float32x4_t __a) +{ + return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_nan_v4sf (__a), 0); +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vmaxvq_f64 (float64x2_t __a) +{ + return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_nan_v2df (__a), 0); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vmaxvq_s8 (int8x16_t __a) +{ + return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vmaxvq_s16 (int16x8_t __a) +{ + return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vmaxvq_s32 (int32x4_t __a) +{ + return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vmaxvq_u8 (uint8x16_t __a) +{ + return vgetq_lane_u8 ((uint8x16_t) + __builtin_aarch64_reduc_umax_v16qi ((int8x16_t) __a), 0); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vmaxvq_u16 (uint16x8_t __a) +{ + return vgetq_lane_u16 ((uint16x8_t) + __builtin_aarch64_reduc_umax_v8hi ((int16x8_t) __a), 0); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vmaxvq_u32 (uint32x4_t __a) +{ + return vgetq_lane_u32 ((uint32x4_t) + 
__builtin_aarch64_reduc_umax_v4si ((int32x4_t) __a), 0); +} + +/* vmaxnmv */ + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vmaxnmv_f32 (float32x2_t __a) +{ + return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a), 0); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vmaxnmvq_f32 (float32x4_t __a) +{ + return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0); +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vmaxnmvq_f64 (float64x2_t __a) +{ + return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0); +} + +/* vmin */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmin_f32 (float32x2_t __a, float32x2_t __b) @@ -21735,6 +21512,144 @@ vminq_u32 (uint32x4_t __a, uint32x4_t __b) (int32x4_t) __b); } +/* vminnm */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vminnm_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_sminv2sf (__a, __b); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vminnmq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_sminv4sf (__a, __b); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vminnmq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_sminv2df (__a, __b); +} + +/* vminv */ + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vminv_f32 (float32x2_t __a) +{ + return vget_lane_f32 (__builtin_aarch64_reduc_smin_nan_v2sf (__a), 0); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vminv_s8 (int8x8_t __a) +{ + return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a), 0); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vminv_s16 (int16x4_t __a) +{ + return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0); +} + +__extension__ 
static __inline int32_t __attribute__ ((__always_inline__)) +vminv_s32 (int32x2_t __a) +{ + return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vminv_u8 (uint8x8_t __a) +{ + return vget_lane_u8 ((uint8x8_t) + __builtin_aarch64_reduc_umin_v8qi ((int8x8_t) __a), 0); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vminv_u16 (uint16x4_t __a) +{ + return vget_lane_u16 ((uint16x4_t) + __builtin_aarch64_reduc_umin_v4hi ((int16x4_t) __a), 0); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vminv_u32 (uint32x2_t __a) +{ + return vget_lane_u32 ((uint32x2_t) + __builtin_aarch64_reduc_umin_v2si ((int32x2_t) __a), 0); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vminvq_f32 (float32x4_t __a) +{ + return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_nan_v4sf (__a), 0); +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vminvq_f64 (float64x2_t __a) +{ + return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_nan_v2df (__a), 0); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vminvq_s8 (int8x16_t __a) +{ + return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vminvq_s16 (int16x8_t __a) +{ + return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vminvq_s32 (int32x4_t __a) +{ + return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vminvq_u8 (uint8x16_t __a) +{ + return vgetq_lane_u8 ((uint8x16_t) + __builtin_aarch64_reduc_umin_v16qi ((int8x16_t) __a), 0); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vminvq_u16 
(uint16x8_t __a) +{ + return vgetq_lane_u16 ((uint16x8_t) + __builtin_aarch64_reduc_umin_v8hi ((int16x8_t) __a), 0); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vminvq_u32 (uint32x4_t __a) +{ + return vgetq_lane_u32 ((uint32x4_t) + __builtin_aarch64_reduc_umin_v4si ((int32x4_t) __a), 0); +} + +/* vminnmv */ + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vminnmv_f32 (float32x2_t __a) +{ + return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vminnmvq_f32 (float32x4_t __a) +{ + return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0); +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vminnmvq_f64 (float64x2_t __a) +{ + return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0); +} + /* vmla */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) diff --git a/gcc/testsuite/gcc.target/aarch64/vect-vfmaxv.c b/gcc/testsuite/gcc.target/aarch64/vect-vfmaxv.c new file mode 100644 index 0000000..58a57a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-vfmaxv.c @@ -0,0 +1,169 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps -ffast-math" } */ + +#include <arm_neon.h> + +extern void abort (void); + +extern float fabsf (float); +extern double fabs (double); +extern int isnan (double); +extern float fmaxf (float, float); +extern float fminf (float, float); +extern double fmax (double, double); +extern double fmin (double, double); + +#define NUM_TESTS 16 +#define DELTA 0.000001 +#define NAN (0.0 / 0.0) + +float input_float32[] = {0.1f, -0.1f, 0.4f, 10.3f, + 200.0f, -800.0f, -13.0f, -0.5f, + NAN, -870.0f, 10.4f, 310.11f, + 0.0f, -865.0f, -2213.0f, -1.5f}; + +double input_float64[] = {0.1, -0.1, 0.4, 10.3, + 200.0, -800.0, -13.0, -0.5, + NAN, -870.0, 10.4, 310.11, + 0.0, -865.0, -2213.0, -1.5}; + +#define EQUALF(a, b) (fabsf (a - b) < 
DELTA) +#define EQUALD(a, b) (fabs (a - b) < DELTA) + +/* Floating point 'unordered' variants. */ + +#undef TEST +#define TEST(MAXMIN, CMP_OP, SUFFIX, Q, TYPE, LANES, FLOAT) \ +int \ +test_v##MAXMIN##v##SUFFIX##_##TYPE##x##LANES##_t (void) \ +{ \ + int i, j; \ + int moves = (NUM_TESTS - LANES) + 1; \ + TYPE##_t out_l[NUM_TESTS]; \ + TYPE##_t out_v[NUM_TESTS]; \ + \ + /* Calculate linearly. */ \ + for (i = 0; i < moves; i++) \ + { \ + out_l[i] = input_##TYPE[i]; \ + for (j = 0; j < LANES; j++) \ + { \ + if (isnan (out_l[i])) \ + continue; \ + if (isnan (input_##TYPE[i + j]) \ + || input_##TYPE[i + j] CMP_OP out_l[i]) \ + out_l[i] = input_##TYPE[i + j]; \ + } \ + } \ + \ + /* Calculate using vector reduction intrinsics. */ \ + for (i = 0; i < moves; i++) \ + { \ + TYPE##x##LANES##_t t1 = vld1##Q##_##SUFFIX (input_##TYPE + i); \ + out_v[i] = v##MAXMIN##v##Q##_##SUFFIX (t1); \ + } \ + \ + /* Compare. */ \ + for (i = 0; i < moves; i++) \ + { \ + if (!EQUAL##FLOAT (out_v[i], out_l[i]) \ + && !(isnan (out_v[i]) && isnan (out_l[i]))) \ + return 0; \ + } \ + return 1; \ +} + +#define BUILD_VARIANTS(TYPE, STYPE, W32, W64, F) \ +TEST (max, >, STYPE, , TYPE, W32, F) \ +TEST (max, >, STYPE, q, TYPE, W64, F) \ +TEST (min, <, STYPE, , TYPE, W32, F) \ +TEST (min, <, STYPE, q, TYPE, W64, F) + +BUILD_VARIANTS (float32, f32, 2, 4, F) +/* { dg-final { scan-assembler "fmaxp\\ts\[0-9\]+, v\[0-9\]+\.2s" } } */ +/* { dg-final { scan-assembler "fminp\\ts\[0-9\]+, v\[0-9\]+\.2s" } } */ +/* { dg-final { scan-assembler "fmaxv\\ts\[0-9\]+, v\[0-9\]+\.4s" } } */ +/* { dg-final { scan-assembler "fminv\\ts\[0-9\]+, v\[0-9\]+\.4s" } } */ +TEST (max, >, f64, q, float64, 2, D) +/* { dg-final { scan-assembler "fmaxp\\td\[0-9\]+, v\[0-9\]+\.2d" } } */ +TEST (min, <, f64, q, float64, 2, D) +/* { dg-final { scan-assembler "fminp\\td\[0-9\]+, v\[0-9\]+\.2d" } } */ + +/* Floating point 'nm' variants. 
*/ + +#undef TEST +#define TEST(MAXMIN, F, SUFFIX, Q, TYPE, LANES, FLOAT) \ +int \ +test_v##MAXMIN##nmv##SUFFIX##_##TYPE##x##LANES##_t (void) \ +{ \ + int i, j; \ + int moves = (NUM_TESTS - LANES) + 1; \ + TYPE##_t out_l[NUM_TESTS]; \ + TYPE##_t out_v[NUM_TESTS]; \ + \ + /* Calculate linearly. */ \ + for (i = 0; i < moves; i++) \ + { \ + out_l[i] = input_##TYPE[i]; \ + for (j = 0; j < LANES; j++) \ + out_l[i] = f##MAXMIN##F (input_##TYPE[i + j], out_l[i]); \ + } \ + \ + /* Calculate using vector reduction intrinsics. */ \ + for (i = 0; i < moves; i++) \ + { \ + TYPE##x##LANES##_t t1 = vld1##Q##_##SUFFIX (input_##TYPE + i); \ + out_v[i] = v##MAXMIN##nmv##Q##_##SUFFIX (t1); \ + } \ + \ + /* Compare. */ \ + for (i = 0; i < moves; i++) \ + { \ + if (!EQUAL##FLOAT (out_v[i], out_l[i])) \ + return 0; \ + } \ + return 1; \ +} + +TEST (max, f, f32, , float32, 2, D) +/* { dg-final { scan-assembler "fmaxnmp\\ts\[0-9\]+, v\[0-9\]+\.2s" } } */ +TEST (min, f, f32, , float32, 2, D) +/* { dg-final { scan-assembler "fminnmp\\ts\[0-9\]+, v\[0-9\]+\.2s" } } */ +TEST (max, f, f32, q, float32, 4, D) +/* { dg-final { scan-assembler "fmaxnmv\\ts\[0-9\]+, v\[0-9\]+\.4s" } } */ +TEST (min, f, f32, q, float32, 4, D) +/* { dg-final { scan-assembler "fminnmv\\ts\[0-9\]+, v\[0-9\]+\.4s" } } */ +TEST (max, , f64, q, float64, 2, D) +/* { dg-final { scan-assembler "fmaxnmp\\td\[0-9\]+, v\[0-9\]+\.2d" } } */ +TEST (min, , f64, q, float64, 2, D) +/* { dg-final { scan-assembler "fminnmp\\td\[0-9\]+, v\[0-9\]+\.2d" } } */ + +#undef TEST +#define TEST(MAXMIN, CMP_OP, SUFFIX, Q, TYPE, LANES, FLOAT) \ +{ \ + if (!test_v##MAXMIN##v##SUFFIX##_##TYPE##x##LANES##_t ()) \ + abort (); \ +} + +int +main (int argc, char **argv) +{ + BUILD_VARIANTS (float32, f32, 2, 4, F) + TEST (max, >, f64, q, float64, 2, D) + TEST (min, <, f64, q, float64, 2, D) + +#undef TEST +#define TEST(MAXMIN, CMP_OP, SUFFIX, Q, TYPE, LANES, FLOAT) \ +{ \ + if (!test_v##MAXMIN##nmv##SUFFIX##_##TYPE##x##LANES##_t ()) \ + abort (); \ +} + 
+ BUILD_VARIANTS (float32, f32, 2, 4, F) + TEST (max, >, f64, q, float64, 2, D) + TEST (min, <, f64, q, float64, 2, D) + + return 0; +} + +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vect-vmaxv.c b/gcc/testsuite/gcc.target/aarch64/vect-vmaxv.c new file mode 100644 index 0000000..212e133 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-vmaxv.c @@ -0,0 +1,117 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps -ffast-math" } */ + +#include <arm_neon.h> + +extern void abort (void); + +#define NUM_TESTS 16 +#define DELTA 0.000001 + +int8_t input_int8[] = {1, 56, 2, -9, -90, 23, 54, 76, + -4, 34, 110, -110, 6, 4, 75, -34}; +int16_t input_int16[] = {1, 56, 2, -9, -90, 23, 54, 76, + -4, 34, 110, -110, 6, 4, 75, -34}; +int32_t input_int32[] = {1, 56, 2, -9, -90, 23, 54, 76, + -4, 34, 110, -110, 6, 4, 75, -34}; + +uint8_t input_uint8[] = {1, 56, 2, 9, 90, 23, 54, 76, + 4, 34, 110, 110, 6, 4, 75, 34}; +uint16_t input_uint16[] = {1, 56, 2, 9, 90, 23, 54, 76, + 4, 34, 110, 110, 6, 4, 75, 34}; +uint32_t input_uint32[] = {1, 56, 2, 9, 90, 23, 54, 76, + 4, 34, 110, 110, 6, 4, 75, 34}; + +#define EQUAL(a, b) (a == b) + +#define TEST(MAXMIN, CMP_OP, SUFFIX, Q, TYPE, LANES) \ +int \ +test_v##MAXMIN##v##SUFFIX##_##TYPE##x##LANES##_t (void) \ +{ \ + int i, j; \ + int moves = (NUM_TESTS - LANES) + 1; \ + TYPE##_t out_l[NUM_TESTS]; \ + TYPE##_t out_v[NUM_TESTS]; \ + \ + /* Calculate linearly. */ \ + for (i = 0; i < moves; i++) \ + { \ + out_l[i] = input_##TYPE[i]; \ + for (j = 0; j < LANES; j++) \ + out_l[i] = input_##TYPE[i + j] CMP_OP out_l[i] ? \ + input_##TYPE[i + j] : out_l[i]; \ + } \ + \ + /* Calculate using vector reduction intrinsics. */ \ + for (i = 0; i < moves; i++) \ + { \ + TYPE##x##LANES##_t t1 = vld1##Q##_##SUFFIX (input_##TYPE + i); \ + out_v[i] = v##MAXMIN##v##Q##_##SUFFIX (t1); \ + } \ + \ + /* Compare. 
*/ \ + for (i = 0; i < moves; i++) \ + { \ + if (!EQUAL (out_v[i], out_l[i])) \ + return 0; \ + } \ + return 1; \ +} + +#define BUILD_VARIANTS(TYPE, STYPE, W32, W64) \ +TEST (max, >, STYPE, , TYPE, W32) \ +TEST (max, >, STYPE, q, TYPE, W64) \ +TEST (min, <, STYPE, , TYPE, W32) \ +TEST (min, <, STYPE, q, TYPE, W64) + +BUILD_VARIANTS (int8, s8, 8, 16) +/* { dg-final { scan-assembler "smaxv\\tb\[0-9\]+, v\[0-9\]+\.8b" } } */ +/* { dg-final { scan-assembler "sminv\\tb\[0-9\]+, v\[0-9\]+\.8b" } } */ +/* { dg-final { scan-assembler "smaxv\\tb\[0-9\]+, v\[0-9\]+\.16b" } } */ +/* { dg-final { scan-assembler "sminv\\tb\[0-9\]+, v\[0-9\]+\.16b" } } */ +BUILD_VARIANTS (uint8, u8, 8, 16) +/* { dg-final { scan-assembler "umaxv\\tb\[0-9\]+, v\[0-9\]+\.8b" } } */ +/* { dg-final { scan-assembler "uminv\\tb\[0-9\]+, v\[0-9\]+\.8b" } } */ +/* { dg-final { scan-assembler "umaxv\\tb\[0-9\]+, v\[0-9\]+\.16b" } } */ +/* { dg-final { scan-assembler "uminv\\tb\[0-9\]+, v\[0-9\]+\.16b" } } */ +BUILD_VARIANTS (int16, s16, 4, 8) +/* { dg-final { scan-assembler "smaxv\\th\[0-9\]+, v\[0-9\]+\.4h" } } */ +/* { dg-final { scan-assembler "sminv\\th\[0-9\]+, v\[0-9\]+\.4h" } } */ +/* { dg-final { scan-assembler "smaxv\\th\[0-9\]+, v\[0-9\]+\.8h" } } */ +/* { dg-final { scan-assembler "sminv\\th\[0-9\]+, v\[0-9\]+\.8h" } } */ +BUILD_VARIANTS (uint16, u16, 4, 8) +/* { dg-final { scan-assembler "umaxv\\th\[0-9\]+, v\[0-9\]+\.4h" } } */ +/* { dg-final { scan-assembler "uminv\\th\[0-9\]+, v\[0-9\]+\.4h" } } */ +/* { dg-final { scan-assembler "umaxv\\th\[0-9\]+, v\[0-9\]+\.8h" } } */ +/* { dg-final { scan-assembler "uminv\\th\[0-9\]+, v\[0-9\]+\.8h" } } */ +BUILD_VARIANTS (int32, s32, 2, 4) +/* { dg-final { scan-assembler "smaxp\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */ +/* { dg-final { scan-assembler "sminp\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */ +/* { dg-final { scan-assembler "smaxv\\ts\[0-9\]+, v\[0-9\]+\.4s" } } */ +/* { dg-final { scan-assembler "sminv\\ts\[0-9\]+, 
v\[0-9\]+\.4s" } } */ +BUILD_VARIANTS (uint32, u32, 2, 4) +/* { dg-final { scan-assembler "umaxp\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */ +/* { dg-final { scan-assembler "uminp\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */ +/* { dg-final { scan-assembler "umaxv\\ts\[0-9\]+, v\[0-9\]+\.4s" } } */ +/* { dg-final { scan-assembler "uminv\\ts\[0-9\]+, v\[0-9\]+\.4s" } } */ + +#undef TEST +#define TEST(MAXMIN, CMP_OP, SUFFIX, Q, TYPE, LANES) \ +{ \ + if (!test_v##MAXMIN##v##SUFFIX##_##TYPE##x##LANES##_t ()) \ + abort (); \ +} + +int +main (int argc, char **argv) +{ + BUILD_VARIANTS (int8, s8, 8, 16) + BUILD_VARIANTS (uint8, u8, 8, 16) + BUILD_VARIANTS (int16, s16, 4, 8) + BUILD_VARIANTS (uint16, u16, 4, 8) + BUILD_VARIANTS (int32, s32, 2, 4) + BUILD_VARIANTS (uint32, u32, 2, 4) + return 0; +} + +/* { dg-final { cleanup-saved-temps } } */