
[ARM] PR51980 / PR49081 Improve Neon permute intrinsics.

Message ID CACUk7=W9md0xyA7tj4QtVsC4Tw0iQC5NEiKiDzJkA4erN=713A@mail.gmail.com

Commit Message

Ramana Radhakrishnan July 5, 2012, 4:51 p.m. UTC
On 20 June 2012 12:29, Julian Brown <julian@codesourcery.com> wrote:
> On Wed, 20 Jun 2012 11:56:39 +0100
> Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> wrote:
>
>> Hi,
>>
>> This patch uses __builtin_shuffle to implement the Neon permute
>> intrinsics, following on from Julian's and my patch last week. It
>> needed support for __builtin_shuffle in the C++ front end, which went
>> in a few days ago, so I'm a little happier with this going in now.
>> The changes to Julian's patch are to drop the "mask" generation; the
>> vector constants are now generated directly instead.
>
> A small stylistic point I noticed in:
>
>    let rec print_lines = function
>      [] -> ()
> -  | [line] -> Format.printf "%s" line
> -  | line::lines -> Format.printf "%s@," line; print_lines lines in
> +  | [line] -> if line <> "" then Format.printf "%s" line else ()
> +  | line::lines -> (if line <> "" then Format.printf "%s@," line);
>                                                    print_lines lines in
>    print_lines body; close_braceblock ffmt;
>    end_function ffmt
>
> You can use constant strings in pattern matches, so this can be just:
>
>   let rec print_lines = function
>     [] | ""::_ -> ()
>   | [line] -> Format.printf...
>   | line::lines -> Format.printf...
>
> You didn't need the brackets () around the if, btw. It's semantically
> quite like C: only a single statement after the "then" is conditional.
> If you want multiple statements conditionalised, the idiomatic
> way to do it is use begin...end (equivalent to { } in C) after the then
> keyword.
>
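
A small standalone sketch of both points, i.e. constant-string patterns
and "begin ... end" grouping (illustrative only, not the committed
neon-gen.ml code; the helper and the test input are made up):

  let rec print_lines = function
      [] -> ()
    | "" :: lines -> print_lines lines        (* skip empty lines *)
    | [line] -> print_string line
    | line :: lines ->
        begin
          print_string line;
          print_newline ()
        end;
        print_lines lines

  let () = print_lines ["int x;"; ""; "return x;"]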

This is what I committed finally.

2012-07-05  Ramana Radhakrishnan  <ramana.radhakrishnan@linaro.org>
	    Julian Brown  <julian@codesourcery.com>

        PR target/49891
        PR target/51980
        * config/arm/neon-gen.ml (return_by_ptr): Delete.
        (print_function): Handle empty strings.
        (return): Delete use of return_by_ptr.
        (mask_shape_for_shuffle): New function.
        (mask_elems): Likewise.
        (shuffle_fn): Likewise.
        (params): Simplify and remove use of return_by_ptr.
        (get_shuffle): New function.
        (print_variant): Update.
        * config/arm/neon.ml (rev_elems): New function.
        (permute_range): Likewise.
        (zip_range): Likewise.
        (uzip_range): Likewise.
        (trn_range): Likewise.
        (zip_elems): Likewise.
        (uzip_elems): Likewise.
        (trn_elems): Likewise.
        (features): New enumeration Use_shuffle. Delete ReturnPtr.
        (pf_su_8_16): New.
        (suf_32): New.
        (ops): Update entries for Vrev64, Vrev32, Vrev16, Vtrn, Vzip, Vuzp.
        * config/arm/arm_neon.h: Regenerate.



2012-07-05  Ramana Radhakrishnan  <ramana.radhakrishnan@linaro.org>

        PR target/49891
        PR target/51980
        * gcc/testsuite/gcc.target/arm/neon/vtrnf32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vtrns32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vtrnu32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vzipf32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vzips32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vzipu32.c: Update.




Ramana

> HTH,
>
> Julian

Patch

Index: gcc/testsuite/gcc.target/arm/neon/vzipu32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vzipu32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vzipu32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_uint32x2x2_t = vzip_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
 }
 
-/* { dg-final { scan-assembler "vzip\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vzipf32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vzipf32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vzipf32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_float32x2x2_t = vzip_f32 (arg0_float32x2_t, arg1_float32x2_t);
 }
 
-/* { dg-final { scan-assembler "vzip\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vtrns32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vtrns32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vtrns32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_int32x2x2_t = vtrn_s32 (arg0_int32x2_t, arg1_int32x2_t);
 }
 
-/* { dg-final { scan-assembler "vtrn\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vtrnu32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vtrnu32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vtrnu32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_uint32x2x2_t = vtrn_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
 }
 
-/* { dg-final { scan-assembler "vtrn\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vtrnf32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vtrnf32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vtrnf32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_float32x2x2_t = vtrn_f32 (arg0_float32x2_t, arg1_float32x2_t);
 }
 
-/* { dg-final { scan-assembler "vtrn\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vzips32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vzips32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vzips32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_int32x2x2_t = vzip_s32 (arg0_int32x2_t, arg1_int32x2_t);
 }
 
-/* { dg-final { scan-assembler "vzip\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/config/arm/neon.ml
===================================================================
--- gcc/config/arm/neon.ml	(revision 189288)
+++ gcc/config/arm/neon.ml	(working copy)
@@ -201,6 +201,42 @@ 
   (* Reinterpret casts.  *)
   | Vreinterp
 
+let rev_elems revsize elsize nelts _ =
+  let mask = (revsize / elsize) - 1 in
+  let arr = Array.init nelts
+    (fun i -> i lxor mask) in
+  Array.to_list arr
+
+let permute_range i stride nelts increment =
+  let rec build i = function
+    0 -> []
+  | nelts -> i :: (i + stride) :: build (i + increment) (pred nelts) in
+  build i nelts
+
+(* Generate a list of integers suitable for vzip.  *)
+let zip_range i stride nelts = permute_range i stride nelts 1
+
+(* Generate a list of integers suitable for vunzip.  *)
+let uzip_range i stride nelts = permute_range i stride nelts 4
+
+(* Generate a list of integers suitable for trn.  *)
+let trn_range i stride nelts = permute_range i stride nelts 2
+
+let zip_elems _ nelts part =
+  match part with
+    `lo -> zip_range 0 nelts (nelts / 2)
+  | `hi -> zip_range (nelts / 2) nelts (nelts / 2)
+
+let uzip_elems _ nelts part =
+  match part with
+    `lo -> uzip_range 0 2 (nelts / 2)
+  | `hi -> uzip_range 1 2 (nelts / 2)
+
+let trn_elems _ nelts part =
+  match part with
+    `lo -> trn_range 0 nelts (nelts / 2)
+  | `hi -> trn_range 1 nelts (nelts / 2)
+
 (* Features used for documentation, to distinguish between some instruction
    variants, and to signal special requirements (e.g. swapping arguments).  *)
 
@@ -214,7 +250,10 @@ 
   | Flipped of string  (* Builtin name to use with flipped arguments.  *)
   | InfoWord  (* Pass an extra word for signage/rounding etc. (always passed
                  for All _, Long, Wide, Narrow shape_forms.  *)
-  | ReturnPtr  (* Pass explicit pointer to return value as first argument.  *)
+    (* Implement builtin as shuffle.  The parameter is a function which returns
+       masks suitable for __builtin_shuffle: arguments are (element size,
+       number of elements, high/low part selector).  *)
+  | Use_shuffle of (int -> int -> [`lo|`hi] -> int list)
     (* A specification as to the shape of instruction expected upon
        disassembly, used if it differs from the shape used to build the
        intrinsic prototype.  Multiple entries in the constructor's argument
@@ -706,8 +745,10 @@ 
 let su_8_32 = [S8; S16; S32; U8; U16; U32]
 let su_8_64 = S64 :: U64 :: su_8_32
 let su_16_64 = [S16; S32; S64; U16; U32; U64]
+let pf_su_8_16 = [P8; P16; S8; S16; U8; U16]
 let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32
 let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64
+let suf_32 = [S32; U32; F32]
 
 let ops =
   [
@@ -1317,12 +1358,18 @@ 
       pf_su_8_64;
 
     (* Reverse elements.  *)
-    Vrev64, [], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32;
-    Vrev64, [], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
-    Vrev32, [], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16];
-    Vrev32, [], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16];
-    Vrev16, [], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8];
-    Vrev16, [], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8];
+    Vrev64, [Use_shuffle (rev_elems 64)], All (2, Dreg), "vrev64", bits_1,
+      P8 :: P16 :: F32 :: su_8_32;
+    Vrev64, [Use_shuffle (rev_elems 64)], All (2, Qreg), "vrev64Q", bits_1,
+      P8 :: P16 :: F32 :: su_8_32;
+    Vrev32, [Use_shuffle (rev_elems 32)], All (2, Dreg), "vrev32", bits_1,
+      [P8; P16; S8; U8; S16; U16];
+    Vrev32, [Use_shuffle (rev_elems 32)], All (2, Qreg), "vrev32Q", bits_1,
+      [P8; P16; S8; U8; S16; U16];
+    Vrev16, [Use_shuffle (rev_elems 16)], All (2, Dreg), "vrev16", bits_1,
+      [P8; S8; U8];
+    Vrev16, [Use_shuffle (rev_elems 16)], All (2, Qreg), "vrev16Q", bits_1,
+      [P8; S8; U8];
 
     (* Bit selection.  *)
     Vbsl,
@@ -1336,25 +1383,19 @@ 
       Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
       pf_su_8_64;
 
-    (* Transpose elements.  **NOTE** ReturnPtr goes some of the way towards
-       generating good code for intrinsics which return structure types --
-       builtins work well by themselves (and understand that the values being
-       stored on e.g. the stack also reside in registers, so can optimise the
-       stores away entirely if the results are used immediately), but
-       intrinsics are very much less efficient. Maybe something can be improved
-       re: inlining, or tweaking the ABI used for intrinsics (a special call
-       attribute?).
-    *)
-    Vtrn, [ReturnPtr], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32;
-    Vtrn, [ReturnPtr], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
-
+    Vtrn, [Use_shuffle trn_elems], Pair_result Dreg, "vtrn", bits_2, pf_su_8_16;
+    Vtrn, [Use_shuffle trn_elems; Instruction_name ["vuzp"]], Pair_result Dreg, "vtrn", bits_2, suf_32;
+    Vtrn, [Use_shuffle trn_elems], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
     (* Zip elements.  *)
-    Vzip, [ReturnPtr], Pair_result Dreg, "vzip", bits_2, pf_su_8_32;
-    Vzip, [ReturnPtr], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;
+    Vzip, [Use_shuffle zip_elems], Pair_result Dreg, "vzip", bits_2, pf_su_8_16;
+    Vzip, [Use_shuffle zip_elems; Instruction_name ["vuzp"]], Pair_result Dreg, "vzip", bits_2, suf_32;
+    Vzip, [Use_shuffle zip_elems], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32; 
 
     (* Unzip elements.  *)
-    Vuzp, [ReturnPtr], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32;
-    Vuzp, [ReturnPtr], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32;
+    Vuzp, [Use_shuffle uzip_elems], Pair_result Dreg, "vuzp", bits_2,
+      pf_su_8_32;
+    Vuzp, [Use_shuffle uzip_elems], Pair_result Qreg, "vuzpQ", bits_2,
+      pf_su_8_32;
 
     (* Element/structure loads.  VLD1 variants.  *)
     Vldx 1,
Index: gcc/config/arm/arm_neon.h
===================================================================
--- gcc/config/arm/arm_neon.h	(revision 189288)
+++ gcc/config/arm/arm_neon.h	(working copy)
@@ -7047,217 +7047,217 @@ 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vrev64_s8 (int8x8_t __a)
 {
-  return (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1);
+  return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vrev64_s16 (int16x4_t __a)
 {
-  return (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1);
+  return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }
 
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vrev64_s32 (int32x2_t __a)
 {
-  return (int32x2_t)__builtin_neon_vrev64v2si (__a, 1);
+  return (int32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vrev64_f32 (float32x2_t __a)
 {
-  return (float32x2_t)__builtin_neon_vrev64v2sf (__a, 3);
+  return (float32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vrev64_u8 (uint8x8_t __a)
 {
-  return (uint8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 0);
+  return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vrev64_u16 (uint16x4_t __a)
 {
-  return (uint16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 0);
+  return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vrev64_u32 (uint32x2_t __a)
 {
-  return (uint32x2_t)__builtin_neon_vrev64v2si ((int32x2_t) __a, 0);
+  return (uint32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vrev64_p8 (poly8x8_t __a)
 {
-  return (poly8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 2);
+  return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vrev64_p16 (poly16x4_t __a)
 {
-  return (poly16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 2);
+  return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vrev64q_s8 (int8x16_t __a)
 {
-  return (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1);
+  return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vrev64q_s16 (int16x8_t __a)
 {
-  return (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1);
+  return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vrev64q_s32 (int32x4_t __a)
 {
-  return (int32x4_t)__builtin_neon_vrev64v4si (__a, 1);
+  return (int32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vrev64q_f32 (float32x4_t __a)
 {
-  return (float32x4_t)__builtin_neon_vrev64v4sf (__a, 3);
+  return (float32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vrev64q_u8 (uint8x16_t __a)
 {
-  return (uint8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 0);
+  return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vrev64q_u16 (uint16x8_t __a)
 {
-  return (uint16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 0);
+  return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vrev64q_u32 (uint32x4_t __a)
 {
-  return (uint32x4_t)__builtin_neon_vrev64v4si ((int32x4_t) __a, 0);
+  return (uint32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vrev64q_p8 (poly8x16_t __a)
 {
-  return (poly8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 2);
+  return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vrev64q_p16 (poly16x8_t __a)
 {
-  return (poly16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 2);
+  return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vrev32_s8 (int8x8_t __a)
 {
-  return (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1);
+  return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vrev32_s16 (int16x4_t __a)
 {
-  return (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1);
+  return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vrev32_u8 (uint8x8_t __a)
 {
-  return (uint8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 0);
+  return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vrev32_u16 (uint16x4_t __a)
 {
-  return (uint16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 0);
+  return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vrev32_p8 (poly8x8_t __a)
 {
-  return (poly8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 2);
+  return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vrev32_p16 (poly16x4_t __a)
 {
-  return (poly16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 2);
+  return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vrev32q_s8 (int8x16_t __a)
 {
-  return (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1);
+  return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vrev32q_s16 (int16x8_t __a)
 {
-  return (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1);
+  return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vrev32q_u8 (uint8x16_t __a)
 {
-  return (uint8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 0);
+  return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vrev32q_u16 (uint16x8_t __a)
 {
-  return (uint16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 0);
+  return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vrev32q_p8 (poly8x16_t __a)
 {
-  return (poly8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 2);
+  return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vrev32q_p16 (poly16x8_t __a)
 {
-  return (poly16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 2);
+  return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vrev16_s8 (int8x8_t __a)
 {
-  return (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1);
+  return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vrev16_u8 (uint8x8_t __a)
 {
-  return (uint8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 0);
+  return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vrev16_p8 (poly8x8_t __a)
 {
-  return (poly8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 2);
+  return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vrev16q_s8 (int8x16_t __a)
 {
-  return (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1);
+  return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vrev16q_u8 (uint8x16_t __a)
 {
-  return (uint8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 0);
+  return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vrev16q_p8 (poly8x16_t __a)
 {
-  return (poly8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 2);
+  return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
@@ -7396,7 +7396,8 @@ 
 vtrn_s8 (int8x8_t __a, int8x8_t __b)
 {
   int8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7404,31 +7405,17 @@ 
 vtrn_s16 (int16x4_t __a, int16x4_t __b)
 {
   int16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vtrn_s32 (int32x2_t __a, int32x2_t __b)
-{
-  int32x2x2_t __rv;
-  __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b);
-  return __rv;
-}
-
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vtrn_f32 (float32x2_t __a, float32x2_t __b)
-{
-  float32x2x2_t __rv;
-  __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
-
 __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
 vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   uint8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7436,23 +7423,17 @@ 
 vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   uint16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  uint32x2x2_t __rv;
-  __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
-  return __rv;
-}
-
 __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
 vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   poly8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7460,15 +7441,44 @@ 
 vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
 {
   poly16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vtrn_s32 (int32x2_t __a, int32x2_t __b)
+{
+  int32x2x2_t __rv;
+  __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vtrn_f32 (float32x2_t __a, float32x2_t __b)
+{
+  float32x2x2_t __rv;
+  __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2x2_t __rv;
+  __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
 __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
 vtrnq_s8 (int8x16_t __a, int8x16_t __b)
 {
   int8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 });
+  __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 });
   return __rv;
 }
 
@@ -7476,7 +7486,8 @@ 
 vtrnq_s16 (int16x8_t __a, int16x8_t __b)
 {
   int16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7484,7 +7495,8 @@ 
 vtrnq_s32 (int32x4_t __a, int32x4_t __b)
 {
   int32x4x2_t __rv;
-  __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
@@ -7492,7 +7504,8 @@ 
 vtrnq_f32 (float32x4_t __a, float32x4_t __b)
 {
   float32x4x2_t __rv;
-  __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b);
+  __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
@@ -7500,7 +7513,8 @@ 
 vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   uint8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 });
+  __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 });
   return __rv;
 }
 
@@ -7508,7 +7522,8 @@ 
 vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   uint16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7516,7 +7531,8 @@ 
 vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   uint32x4x2_t __rv;
-  __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
@@ -7524,7 +7540,8 @@ 
 vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
   poly8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 });
+  __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 });
   return __rv;
 }
 
@@ -7532,7 +7549,8 @@ 
 vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
 {
   poly16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7540,7 +7558,8 @@ 
 vzip_s8 (int8x8_t __a, int8x8_t __b)
 {
   int8x8x2_t __rv;
-  __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7548,31 +7567,17 @@ 
 vzip_s16 (int16x4_t __a, int16x4_t __b)
 {
   int16x4x2_t __rv;
-  __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vzip_s32 (int32x2_t __a, int32x2_t __b)
-{
-  int32x2x2_t __rv;
-  __builtin_neon_vzipv2si (&__rv.val[0], __a, __b);
-  return __rv;
-}
-
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vzip_f32 (float32x2_t __a, float32x2_t __b)
-{
-  float32x2x2_t __rv;
-  __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
-
 __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
 vzip_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   uint8x8x2_t __rv;
-  __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7580,23 +7585,17 @@ 
 vzip_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   uint16x4x2_t __rv;
-  __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vzip_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  uint32x2x2_t __rv;
-  __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
-  return __rv;
-}
-
 __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
 vzip_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   poly8x8x2_t __rv;
-  __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7604,15 +7603,44 @@ 
 vzip_p16 (poly16x4_t __a, poly16x4_t __b)
 {
   poly16x4x2_t __rv;
-  __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vzip_s32 (int32x2_t __a, int32x2_t __b)
+{
+  int32x2x2_t __rv;
+  __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vzip_f32 (float32x2_t __a, float32x2_t __b)
+{
+  float32x2x2_t __rv;
+  __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vzip_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2x2_t __rv;
+  __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
 __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
 vzipq_s8 (int8x16_t __a, int8x16_t __b)
 {
   int8x16x2_t __rv;
-  __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+  __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 });
   return __rv;
 }
 
@@ -7620,7 +7648,8 @@ 
 vzipq_s16 (int16x8_t __a, int16x8_t __b)
 {
   int16x8x2_t __rv;
-  __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7628,7 +7657,8 @@ 
 vzipq_s32 (int32x4_t __a, int32x4_t __b)
 {
   int32x4x2_t __rv;
-  __builtin_neon_vzipv4si (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
@@ -7636,7 +7666,8 @@ 
 vzipq_f32 (float32x4_t __a, float32x4_t __b)
 {
   float32x4x2_t __rv;
-  __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b);
+  __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
@@ -7644,7 +7675,8 @@ 
 vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   uint8x16x2_t __rv;
-  __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+  __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 });
   return __rv;
 }
 
@@ -7652,7 +7684,8 @@ 
 vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   uint16x8x2_t __rv;
-  __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7660,7 +7693,8 @@ 
 vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   uint32x4x2_t __rv;
-  __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
@@ -7668,7 +7702,8 @@ 
 vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
   poly8x16x2_t __rv;
-  __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+  __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 });
   return __rv;
 }
 
@@ -7676,7 +7711,8 @@ 
 vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
 {
   poly16x8x2_t __rv;
-  __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7684,7 +7720,8 @@ 
 vuzp_s8 (int8x8_t __a, int8x8_t __b)
 {
   int8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7692,7 +7729,8 @@ 
 vuzp_s16 (int16x4_t __a, int16x4_t __b)
 {
   int16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7700,7 +7738,8 @@ 
 vuzp_s32 (int32x2_t __a, int32x2_t __b)
 {
   int32x2x2_t __rv;
-  __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
   return __rv;
 }
 
@@ -7708,7 +7747,8 @@ 
 vuzp_f32 (float32x2_t __a, float32x2_t __b)
 {
   float32x2x2_t __rv;
-  __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b);
+  __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
   return __rv;
 }
 
@@ -7716,7 +7756,8 @@ 
 vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   uint8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7724,7 +7765,8 @@ 
 vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   uint16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7732,7 +7774,8 @@ 
 vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   uint32x2x2_t __rv;
-  __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+  __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
   return __rv;
 }
 
@@ -7740,7 +7783,8 @@ 
 vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   poly8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7748,7 +7792,8 @@ 
 vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
 {
   poly16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7756,7 +7801,8 @@ 
 vuzpq_s8 (int8x16_t __a, int8x16_t __b)
 {
   int8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
+  __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 });
   return __rv;
 }
 
@@ -7764,7 +7810,8 @@ 
 vuzpq_s16 (int16x8_t __a, int16x8_t __b)
 {
   int16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7772,7 +7819,8 @@ 
 vuzpq_s32 (int32x4_t __a, int32x4_t __b)
 {
   int32x4x2_t __rv;
-  __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7780,7 +7828,8 @@ 
 vuzpq_f32 (float32x4_t __a, float32x4_t __b)
 {
   float32x4x2_t __rv;
-  __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b);
+  __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7788,7 +7837,8 @@ 
 vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   uint8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
+  __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 });
   return __rv;
 }
 
@@ -7796,7 +7846,8 @@ 
 vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   uint16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7804,7 +7855,8 @@ 
 vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   uint32x4x2_t __rv;
-  __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7812,7 +7864,8 @@ 
 vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
   poly8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
+  __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 });
   return __rv;
 }
 
@@ -7820,7 +7873,8 @@ 
 vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
 {
   poly16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
Index: gcc/config/arm/neon-gen.ml
===================================================================
--- gcc/config/arm/neon-gen.ml	(revision 189288)
+++ gcc/config/arm/neon-gen.ml	(working copy)
@@ -91,15 +91,14 @@ 
   end;
   open_braceblock ffmt;
   let rec print_lines = function
-    [] -> ()
+    []       -> ()
+  | "" :: lines -> print_lines lines
   | [line] -> Format.printf "%s" line
-  | line::lines -> Format.printf "%s@," line; print_lines lines in
+  | line::lines -> Format.printf "%s@," line ; print_lines lines in
   print_lines body;
   close_braceblock ffmt;
   end_function ffmt
 
-let return_by_ptr features = List.mem ReturnPtr features
-
 let union_string num elts base =
   let itype = inttype_for_array num elts in
   let iname = string_of_inttype itype
@@ -141,29 +140,76 @@ 
 
 (* Return a tuple of a list of declarations to go at the start of the function,
    and a list of statements needed to return THING.  *)
-let return arity return_by_ptr thing =
+let return arity thing =
   match arity with
     Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
   | Arity4 (ret, _, _, _, _) ->
-    match ret with
-      T_arrayof (num, vec) ->
-        if return_by_ptr then
-          let sname = string_of_vectype ret in
-          [Printf.sprintf "%s __rv;" sname],
-          [thing ^ ";"; "return __rv;"]
-        else
+      begin match ret with
+	T_arrayof (num, vec) ->
           let uname = union_string num vec "__rv" in
           [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"]
-    | T_void -> [], [thing ^ ";"]
-    | _ ->
-        [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+      | T_void ->
+	  [], [thing ^ ";"]
+      | _ ->
+	  [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+      end
 
+let mask_shape_for_shuffle = function
+    All (num, reg) -> All (num, reg)
+  | Pair_result reg -> All (2, reg)
+  | _ -> failwith "mask_for_shuffle"
+
+let mask_elems shuffle shape elttype part =
+  let elem_size = elt_width elttype in
+  let num_elems =
+    match regmap shape 0 with
+      Dreg -> 64 / elem_size
+    | Qreg -> 128 / elem_size
+    | _ -> failwith "mask_elems" in
+  shuffle elem_size num_elems part
+
+(* Return a tuple of a list of declarations and a list of statements needed
+   to implement an intrinsic using __builtin_shuffle.  SHUFFLE is a function
+   which returns a list of elements suitable for using as a mask.  *)
+
+let shuffle_fn shuffle shape arity elttype =
+  let mshape = mask_shape_for_shuffle shape in
+  let masktype = type_for_elt mshape (unsigned_of_elt elttype) 0 in
+  let masktype_str = string_of_vectype masktype in
+  let shuffle_res = type_for_elt mshape elttype 0 in
+  let shuffle_res_str = string_of_vectype shuffle_res in
+  match arity with
+    Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
+  | Arity4 (ret, _, _, _, _) ->
+      begin match ret with
+        T_arrayof (num, vec) ->
+	  let elems1 = mask_elems shuffle mshape elttype `lo
+	  and elems2 = mask_elems shuffle mshape elttype `hi in
+	  let mask1 = (String.concat ", " (List.map string_of_int elems1))
+	  and mask2 = (String.concat ", " (List.map string_of_int elems2)) in
+	  let shuf1 = Printf.sprintf
+	    "__rv.val[0] = (%s) __builtin_shuffle (__a, __b, (%s) { %s });"
+	    shuffle_res_str masktype_str mask1
+	  and shuf2 = Printf.sprintf
+	    "__rv.val[1] = (%s) __builtin_shuffle (__a, __b, (%s) { %s });"
+	    shuffle_res_str masktype_str mask2 in
+	  [Printf.sprintf "%s __rv;" (string_of_vectype ret);],
+	  [shuf1; shuf2; "return __rv;"]
+      | _ ->
+          let elems = mask_elems shuffle mshape elttype `lo in
+          let mask =  (String.concat ", " (List.map string_of_int elems)) in
+	  let shuf = Printf.sprintf
+	    "return (%s) __builtin_shuffle (__a, (%s) { %s });" shuffle_res_str masktype_str mask in
+	  [""],
+	  [shuf]
+      end
+
 let rec element_type ctype =
   match ctype with
     T_arrayof (_, v) -> element_type v
   | _ -> ctype
 
-let params return_by_ptr ps =
+let params ps =
   let pdecls = ref [] in
   let ptype t p =
     match t with
@@ -180,13 +226,7 @@ 
   | Arity3 (_, t1, t2, t3) -> [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"]
   | Arity4 (_, t1, t2, t3, t4) ->
       [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"; ptype t4 "__d"] in
-  match ps with
-    Arity0 ret | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
-  | Arity4 (ret, _, _, _, _) ->
-      if return_by_ptr then
-        !pdecls, add_cast (T_ptrto (element_type ret)) "&__rv.val[0]" :: plist
-      else
-        !pdecls, plist
+  !pdecls, plist
 
 let modify_params features plist =
   let is_flipped =
@@ -239,17 +279,27 @@ 
     and srcmode = mode_of_elt src shape in
     string_of_mode dstmode ^ string_of_mode srcmode
 
+let get_shuffle features =
+  try
+    match List.find (function Use_shuffle _ -> true | _ -> false) features with
+      Use_shuffle fn -> Some fn
+    | _ -> None
+  with Not_found -> None
+
 let print_variant opcode features shape name (ctype, asmtype, elttype) =
   let bits = infoword_value elttype features in
   let modesuf = mode_suffix elttype shape in
-  let return_by_ptr = return_by_ptr features in
-  let pdecls, paramlist = params return_by_ptr ctype in
-  let paramlist' = modify_params features paramlist in
-  let paramlist'' = extra_word shape features paramlist' bits in
-  let parstr = String.concat ", " paramlist'' in
-  let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
-                  (builtin_name features name) modesuf parstr in
-  let rdecls, stmts = return ctype return_by_ptr builtin in
+  let pdecls, paramlist = params ctype in
+  let rdecls, stmts =
+    match get_shuffle features with
+      Some shuffle -> shuffle_fn shuffle shape ctype elttype
+    | None ->
+	let paramlist' = modify_params features paramlist in
+	let paramlist'' = extra_word shape features paramlist' bits in
+	let parstr = String.concat ", " paramlist'' in
+	let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
+                	(builtin_name features name) modesuf parstr in
+	return ctype builtin in
   let body = pdecls @ rdecls @ stmts
   and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
   print_function ctype fnname body