
[AArch64/ARM,2/3] Detect EXT patterns to vec_perm_const, use for EXT intrinsics

Message ID 535820F9.40901@arm.com
State New

Commit Message

Alan Lawrence April 23, 2014, 8:22 p.m. UTC
This patch updates aarch64_expand_vec_perm_const to recognise patterns for the 
EXT instruction, and replaces the temporary inline assembler in the vext[q]_* 
intrinsics with calls to __builtin_shuffle, which are now picked up by that 
expansion path (a testcase is included).
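
For illustration, this is the shape of shuffle the new expansion path is
intended to recognise; the expected assembly in the comment assumes a
little-endian target and is an approximation, not taken from the patch:

#include <arm_neon.h>

/* Indices increase by one and span both inputs, so with this patch
   aarch64_evpc_ext should match and emit a single EXT, e.g.
   "ext v0.8b, v0.8b, v1.8b, #2", rather than a table-lookup sequence.  */
uint8x8_t
concat_from_lane_2 (uint8x8_t a, uint8x8_t b)
{
  return __builtin_shuffle (a, b, (uint8x8_t) {2, 3, 4, 5, 6, 7, 8, 9});
}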

A complication is that when an out-of-range immediate value is passed to the 
intrinsic, we wish to produce an error message (as the previous __asm__ did) 
rather than emit shuffle code that was probably not what the programmer 
intended. Hence the patch adds a new __builtin function that generates no RTL 
but checks the immediate at expand time. This produces error messages along 
the lines of:

In file included from test_ext_u32.c:1:0:
[...]/install/lib/gcc/aarch64_be-none-elf/4.9.0/include/arm_neon.h: In function 
'test_vext_u32_2':
[...]/install/lib/gcc/aarch64_be-none-elf/4.9.0/include/arm_neon.h:18259:3: 
error: constant out of range
    __builtin_aarch64_im_range_checksi (__c, 2);

The error message produced is not ideal and does not include the exact line 
number, but this is much the same as for other intrinsics, and it does at least 
include the source function, here 'test_vext_u32_2'.
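
As a sketch of the behaviour described above (the function names here are
illustrative, following the 'test_vext_u32_2' example in the error output):

#include <arm_neon.h>

uint32x2_t
use_vext_in_range (uint32x2_t a, uint32x2_t b)
{
  /* Index 1 is valid for a 2-lane vector; the __builtin_shuffle inside
     vext_u32 should be recognised and compiled to a single EXT.  */
  return vext_u32 (a, b, 1);
}

uint32x2_t
test_vext_u32_2 (uint32x2_t a, uint32x2_t b)
{
  /* Index 2 is out of range for a 2-lane vector; the lane-bounds check
     rejects it with "constant out of range" instead of letting the
     shuffle mask silently wrap.  */
  return vext_u32 (a, b, 2);
}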

No regressions on aarch64-none-elf.
On aarch64_be-none-elf, no regressions following 
http://gcc.gnu.org/ml/gcc-patches/2014-04/msg00579.html; prior to that patch 
there is some testsuite noise because of unexpected success in vectorizing loops.
The new tests from the first patch pass on both targets.

gcc/ChangeLog:
2014-03-27  Alan Lawrence  <alan.lawrence@arm.com>
	* config/aarch64/aarch64-builtins.c (aarch64_types_binopv_qualifiers,
	TYPES_BINOPV): New static data.
	* config/aarch64/aarch64-simd-builtins.def (im_lane_bound): New builtin.
	* config/aarch64/aarch64-simd.md (aarch64_ext, aarch64_im_lane_boundsi):
	New patterns.
	* config/aarch64/aarch64.c (aarch64_expand_vec_perm_const_1): Match
	patterns for EXT.
	(aarch64_evpc_ext): New function.

	* config/aarch64/iterators.md (UNSPEC_EXT): New enum element.

	* config/aarch64/arm_neon.h (vext_f32, vext_f64, vext_p8, vext_p16,
	vext_s8, vext_s16, vext_s32, vext_s64, vext_u8, vext_u16, vext_u32,
	vext_u64, vextq_f32, vextq_f64, vextq_p8, vextq_p16, vextq_s8,
	vextq_s16, vextq_s32, vextq_s64, vextq_u8, vextq_u16, vextq_u32,
	vextq_u64): Replace __asm with __builtin_shuffle and im_lane_boundsi.

Comments

Marcus Shawcroft May 19, 2014, 12:53 p.m. UTC | #1
On 23 April 2014 21:22, Alan Lawrence <alan.lawrence@arm.com> wrote:

> 2014-03-27  Alan Lawrence  <alan.lawrence@arm.com>
>         * config/aarch64/aarch64-builtins.c
> (aarch64_types_binopv_qualifiers,
>         TYPES_BINOPV): New static data.
>         * config/aarch64/aarch64-simd-builtins.def (im_lane_bound): New
> builtin.
>         * config/aarch64/aarch64-simd.md (aarch64_ext,
> aarch64_im_lane_boundsi):
>         New patterns.
>         * config/aarch64/aarch64.c (aarch64_expand_vec_perm_const_1): Match
>         patterns for EXT.
>         (aarch64_evpc_ext): New function.
>
>         * config/aarch64/iterators.md (UNSPEC_EXT): New enum element.
>
>         * config/aarch64/arm_neon.h (vext_f32, vext_f64, vext_p8, vext_p16,
>         vext_s8, vext_s16, vext_s32, vext_s64, vext_u8, vext_u16, vext_u32,
>         vext_u64, vextq_f32, vextq_f64, vextq_p8, vextq_p16, vextq_s8,
>         vextq_s16, vextq_s32, vextq_s64, vextq_u8, vextq_u16, vextq_u32,
>         vextq_u64): Replace __asm with __builtin_shuffle and
> im_lane_boundsi.


OK /Marcus
Christophe Lyon May 30, 2014, 7:40 p.m. UTC | #2
Hi Alan

This causes g++ to ICE on pr59378 test, for aarch64 targets:
http://cbuild.validation.linaro.org/build/cross-validation/gcc/211058/report-build-info.html

Can you check?

Thanks,

Christophe.


On 19 May 2014 14:53, Marcus Shawcroft <marcus.shawcroft@gmail.com> wrote:
> On 23 April 2014 21:22, Alan Lawrence <alan.lawrence@arm.com> wrote:
>
>> 2014-03-27  Alan Lawrence  <alan.lawrence@arm.com>
>>         * config/aarch64/aarch64-builtins.c
>> (aarch64_types_binopv_qualifiers,
>>         TYPES_BINOPV): New static data.
>>         * config/aarch64/aarch64-simd-builtins.def (im_lane_bound): New
>> builtin.
>>         * config/aarch64/aarch64-simd.md (aarch64_ext,
>> aarch64_im_lane_boundsi):
>>         New patterns.
>>         * config/aarch64/aarch64.c (aarch64_expand_vec_perm_const_1): Match
>>         patterns for EXT.
>>         (aarch64_evpc_ext): New function.
>>
>>         * config/aarch64/iterators.md (UNSPEC_EXT): New enum element.
>>
>>         * config/aarch64/arm_neon.h (vext_f32, vext_f64, vext_p8, vext_p16,
>>         vext_s8, vext_s16, vext_s32, vext_s64, vext_u8, vext_u16, vext_u32,
>>         vext_u64, vextq_f32, vextq_f64, vextq_p8, vextq_p16, vextq_s8,
>>         vextq_s16, vextq_s32, vextq_s64, vextq_u8, vextq_u16, vextq_u32,
>>         vextq_u64): Replace __asm with __builtin_shuffle and
>> im_lane_boundsi.
>
>
> OK /Marcus
Alan Lawrence June 2, 2014, 9:32 a.m. UTC | #3
Yes, reproduced. Seems the mid-end doesn't elide no-op masks at -O0 after all...

Fix in progress, think it's almost (tho not quite) simply a bad assertion.

--Alan
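
An illustrative guess at how such a no-op mask could reach the new code (this
is an assumption, not taken from the pr59378 test itself):

#include <arm_neon.h>

uint32x2_t
identity_mask (uint32x2_t a, uint32x2_t b)
{
  /* Mask {0, 1} simply returns 'a'.  When optimizing, the mid-end is
     expected to fold this away, but if it survives to the backend at -O0
     it reaches aarch64_evpc_ext with d->perm[0] == 0 and trips the
     gcc_assert (location != 0).  */
  return __builtin_shuffle (a, b, (uint32x2_t) {0, 1});
}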


Christophe Lyon wrote:
> Hi Alan
> 
> This causes g++ to ICE on pr59378 test, for aarch64 targets:
> http://cbuild.validation.linaro.org/build/cross-validation/gcc/211058/report-build-info.html
> 
> Can you check?
> 
> Thanks,
> 
> Christophe.
> 
> 
> On 19 May 2014 14:53, Marcus Shawcroft <marcus.shawcroft@gmail.com> wrote:
>> On 23 April 2014 21:22, Alan Lawrence <alan.lawrence@arm.com> wrote:
>>
>>> 2014-03-27  Alan Lawrence  <alan.lawrence@arm.com>
>>>         * config/aarch64/aarch64-builtins.c
>>> (aarch64_types_binopv_qualifiers,
>>>         TYPES_BINOPV): New static data.
>>>         * config/aarch64/aarch64-simd-builtins.def (im_lane_bound): New
>>> builtin.
>>>         * config/aarch64/aarch64-simd.md (aarch64_ext,
>>> aarch64_im_lane_boundsi):
>>>         New patterns.
>>>         * config/aarch64/aarch64.c (aarch64_expand_vec_perm_const_1): Match
>>>         patterns for EXT.
>>>         (aarch64_evpc_ext): New function.
>>>
>>>         * config/aarch64/iterators.md (UNSPEC_EXT): New enum element.
>>>
>>>         * config/aarch64/arm_neon.h (vext_f32, vext_f64, vext_p8, vext_p16,
>>>         vext_s8, vext_s16, vext_s32, vext_s64, vext_u8, vext_u16, vext_u32,
>>>         vext_u64, vextq_f32, vextq_f64, vextq_p8, vextq_p16, vextq_s8,
>>>         vextq_s16, vextq_s32, vextq_s64, vextq_u8, vextq_u16, vextq_u32,
>>>         vextq_u64): Replace __asm with __builtin_shuffle and
>>> im_lane_boundsi.
>>
>> OK /Marcus
>

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 55cfe0a..307f60c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -153,6 +153,10 @@  aarch64_types_binop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_maybe_immediate };
 #define TYPES_BINOP (aarch64_types_binop_qualifiers)
 static enum aarch64_type_qualifiers
+aarch64_types_binopv_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_void, qualifier_none, qualifier_none };
+#define TYPES_BINOPV (aarch64_types_binopv_qualifiers)
+static enum aarch64_type_qualifiers
 aarch64_types_binopu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned };
 #define TYPES_BINOPU (aarch64_types_binopu_qualifiers)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index c9b7570..985acdb 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -393,3 +393,6 @@ 
   /* Implemented by aarch64_crypto_pmull<mode>.  */
   VAR1 (BINOPP, crypto_pmull, 0, di)
   VAR1 (BINOPP, crypto_pmull, 0, v2di)
+
+  /* Meta-op to check lane bounds of immediate in aarch64_expand_builtin.  */
+  VAR1 (BINOPV, im_lane_bound, 0, si)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 73aee2c..ceb3003 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4057,6 +4057,35 @@ 
   [(set_attr "type" "neon_permute<q>")]
 )
 
+;; Note immediate (third) operand is lane index not byte index.
+(define_insn "aarch64_ext<mode>"
+  [(set (match_operand:VALL 0 "register_operand" "=w")
+        (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")
+                      (match_operand:VALL 2 "register_operand" "w")
+                      (match_operand:SI 3 "immediate_operand" "i")]
+                     UNSPEC_EXT))]
+  "TARGET_SIMD"
+{
+  operands[3] = GEN_INT (INTVAL (operands[3])
+      * GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)));
+  return "ext\\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>, #%3";
+}
+  [(set_attr "type" "neon_ext<q>")]
+)
+
+;; This exists solely to check the arguments to the corresponding __builtin.
+;; Used where we want an error for out-of-range indices which would otherwise
+;; be silently wrapped (e.g. the mask to a __builtin_shuffle).
+(define_expand "aarch64_im_lane_boundsi"
+  [(match_operand:SI 0 "immediate_operand" "i")
+   (match_operand:SI 1 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  aarch64_simd_lane_bounds (operands[0], 0, INTVAL (operands[1]));
+  DONE;
+}
+)
+
 (define_insn "aarch64_st2<mode>_dreg"
   [(set (match_operand:TI 0 "aarch64_simd_struct_operand" "=Utv")
 	(unspec:TI [(match_operand:OI 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a3147ee..138bb8a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8033,6 +8033,70 @@  aarch64_evpc_zip (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns for the EXT insn.  */
+
+static bool
+aarch64_evpc_ext (struct expand_vec_perm_d *d)
+{
+  unsigned int i, nelt = d->nelt;
+  rtx (*gen) (rtx, rtx, rtx, rtx);
+  rtx offset;
+
+  unsigned int location = d->perm[0]; /* Always < nelt.  */
+
+  /* Check if the extracted indices are increasing by one.  */
+  for (i = 1; i < nelt; i++)
+    {
+      unsigned int required = location + i;
+      if (d->one_vector_p)
+        {
+          /* We'll pass the same vector in twice, so allow indices to wrap.  */
+	  required &= (nelt - 1);
+	}
+      if (d->perm[i] != required)
+        return false;
+    }
+
+  /* The mid-end handles masks that just return one of the input vectors.  */
+  gcc_assert (location != 0);
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_aarch64_extv16qi; break;
+    case V8QImode: gen = gen_aarch64_extv8qi; break;
+    case V4HImode: gen = gen_aarch64_extv4hi; break;
+    case V8HImode: gen = gen_aarch64_extv8hi; break;
+    case V2SImode: gen = gen_aarch64_extv2si; break;
+    case V4SImode: gen = gen_aarch64_extv4si; break;
+    case V2SFmode: gen = gen_aarch64_extv2sf; break;
+    case V4SFmode: gen = gen_aarch64_extv4sf; break;
+    case V2DImode: gen = gen_aarch64_extv2di; break;
+    case V2DFmode: gen = gen_aarch64_extv2df; break;
+    default:
+      return false;
+    }
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  if (BYTES_BIG_ENDIAN)
+    {
+      /* After setup, we want the high elements of the first vector (stored
+         at the LSB end of the register), and the low elements of the second
+         vector (stored at the MSB end of the register). So swap.  */
+      rtx temp = d->op0;
+      d->op0 = d->op1;
+      d->op1 = temp;
+      /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
+      location = nelt - location;
+    }
+
+  offset = GEN_INT (location);
+  emit_insn (gen (d->target, d->op0, d->op1, offset));
+  return true;
+}
+
 static bool
 aarch64_evpc_dup (struct expand_vec_perm_d *d)
 {
@@ -8133,7 +8197,9 @@  aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 
   if (TARGET_SIMD)
     {
-      if (aarch64_evpc_zip (d))
+      if (aarch64_evpc_ext (d))
+	return true;
+      else if (aarch64_evpc_zip (d))
 	return true;
       else if (aarch64_evpc_uzp (d))
 	return true;
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 747a292..701bfa0 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -5414,318 +5414,6 @@  vcvtxd_f32_f64 (float64_t a)
   return result;
 }
 
-#define vext_f32(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_f64(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x1_t b_ = (b);                                            \
-       float64x1_t a_ = (a);                                            \
-       float64x1_t result;                                              \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_p8(a, b, c)                                                \
-  __extension__                                                         \
-    ({                                                                  \
-       poly8x8_t b_ = (b);                                              \
-       poly8x8_t a_ = (a);                                              \
-       poly8x8_t result;                                                \
-       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_p16(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       poly16x4_t b_ = (b);                                             \
-       poly16x4_t a_ = (a);                                             \
-       poly16x4_t result;                                               \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_s8(a, b, c)                                                \
-  __extension__                                                         \
-    ({                                                                  \
-       int8x8_t b_ = (b);                                               \
-       int8x8_t a_ = (a);                                               \
-       int8x8_t result;                                                 \
-       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_s16(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_s32(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_s64(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       int64x1_t b_ = (b);                                              \
-       int64x1_t a_ = (a);                                              \
-       int64x1_t result;                                                \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_u8(a, b, c)                                                \
-  __extension__                                                         \
-    ({                                                                  \
-       uint8x8_t b_ = (b);                                              \
-       uint8x8_t a_ = (a);                                              \
-       uint8x8_t result;                                                \
-       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_u16(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_u32(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_u64(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64x1_t b_ = (b);                                             \
-       uint64x1_t a_ = (a);                                             \
-       uint64x1_t result;                                               \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_f32(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_f64(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_p8(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       poly8x16_t b_ = (b);                                             \
-       poly8x16_t a_ = (a);                                             \
-       poly8x16_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_p16(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       poly16x8_t b_ = (b);                                             \
-       poly16x8_t a_ = (a);                                             \
-       poly16x8_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_s8(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       int8x16_t b_ = (b);                                              \
-       int8x16_t a_ = (a);                                              \
-       int8x16_t result;                                                \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_s16(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_s32(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_s64(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       int64x2_t b_ = (b);                                              \
-       int64x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_u8(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       uint8x16_t b_ = (b);                                             \
-       uint8x16_t a_ = (a);                                             \
-       uint8x16_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_u16(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_u32(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_u64(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64x2_t b_ = (b);                                             \
-       uint64x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
 {
@@ -18489,6 +18177,292 @@  vdupd_laneq_u64 (uint64x2_t __a, const int __b)
   return __aarch64_vgetq_lane_u64 (__a, __b);
 }
 
+/* vext  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 2);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
+#endif
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c)
+{
+  /* The only possible index to the assembler instruction returns element 0.  */
+  __builtin_aarch64_im_lane_boundsi (__c, 1);
+  return __a;
+}
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 8);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint8x8_t)
+      {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 4);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+      (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
+#endif
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 8);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint8x8_t)
+      {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 4);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+      (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
+#endif
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 2);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
+#endif
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c)
+{
+  /* The only possible index to the assembler instruction returns element 0.  */
+  __builtin_aarch64_im_lane_boundsi (__c, 1);
+  return __a;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 8);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint8x8_t)
+      {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 4);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+      (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
+#endif
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vext_u32 (uint32x2_t __a, uint32x2_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 2);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
+#endif
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c)
+{
+  /* The only possible index to the assembler instruction returns element 0.  */
+  __builtin_aarch64_im_lane_boundsi (__c, 1);
+  return __a;
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 4);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+      (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
+#endif
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vextq_f64 (float64x2_t __a, float64x2_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 2);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
+#endif
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vextq_p8 (poly8x16_t __a, poly8x16_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 16);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint8x16_t)
+      {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
+       24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x16_t)
+      {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+       __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+#endif
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vextq_p16 (poly16x8_t __a, poly16x8_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 8);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint16x8_t)
+      {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vextq_s8 (int8x16_t __a, int8x16_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 16);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint8x16_t)
+      {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
+       24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x16_t)
+      {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+       __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+#endif
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vextq_s16 (int16x8_t __a, int16x8_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 8);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint16x8_t)
+      {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vextq_s32 (int32x4_t __a, int32x4_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 4);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+      (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
+#endif
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vextq_s64 (int64x2_t __a, int64x2_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 2);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
+#endif
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 16);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint8x16_t)
+      {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
+       24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x16_t)
+      {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+       __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+#endif
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vextq_u16 (uint16x8_t __a, uint16x8_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 8);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint16x8_t)
+      {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vextq_u32 (uint32x4_t __a, uint32x4_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 4);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+      (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
+#endif
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vextq_u64 (uint64x2_t __a, uint64x2_t __b, __const int __c)
+{
+  __builtin_aarch64_im_lane_boundsi (__c, 2);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
+#else
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
+#endif
+}
+
 /* vfma_lane  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f1339b8..aa14fae 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -267,6 +267,7 @@ 
     UNSPEC_UZP2		; Used in vector permute patterns.
     UNSPEC_TRN1		; Used in vector permute patterns.
     UNSPEC_TRN2		; Used in vector permute patterns.
+    UNSPEC_EXT		; Used in aarch64-simd.md.
     UNSPEC_AESE		; Used in aarch64-simd.md.
     UNSPEC_AESD         ; Used in aarch64-simd.md.
     UNSPEC_AESMC        ; Used in aarch64-simd.md.