Patchwork [ARM] PR51980 / PR49081 Improve Neon permute intrinsics.

login
register
mail settings
Submitter Ramana Radhakrishnan
Date July 5, 2012, 4:51 p.m.
Message ID <CACUk7=W9md0xyA7tj4QtVsC4Tw0iQC5NEiKiDzJkA4erN=713A@mail.gmail.com>
Download mbox | patch
Permalink /patch/169225/
State New
Headers show

Comments

Ramana Radhakrishnan - July 5, 2012, 4:51 p.m.
On 20 June 2012 12:29, Julian Brown <julian@codesourcery.com> wrote:
> On Wed, 20 Jun 2012 11:56:39 +0100
> Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> wrote:
>
>> Hi,
>>
>> This patch helps use the __builtin_shuffle intrinsics to implement the
>> Neon permute intrinsics following on from Julian's and my patch last
>> week. It needed support for __builtin_shuffle in the C++ frontend
>> which is now in and has been for the past few days , so I'm a little
>> happier with this going in now.The changes to Julian's  patch are to
>> drop the "mask" generation and now this directly generates the vector
>> constants instead.
>
> A small stylistic point I noticed: in,
>
>    let rec print_lines = function
>      [] -> ()
> -  | [line] -> Format.printf "%s" line
> -  | line::lines -> Format.printf "%s@," line; print_lines lines in
> +  | [line] -> if line <> "" then Format.printf "%s" line else ()
> +  | line::lines -> (if line <> "" then Format.printf "%s@," line);
>                                                    print_lines lines in
>    print_lines body; close_braceblock ffmt;
>    end_function ffmt
>
> You can use constant strings in pattern matches, so this can be just:
>
>   let rec print_lines = function
>     [] | ""::_ -> ()
>   | [line] -> Format.printf...
>   | line::lines -> Format.printf...
>
> You didn't need the brackets () around the if, btw. It's semantically
> quite like C: only a single statement after the "then" is conditional.
> If you want multiple statements conditionalised, the idiomatic
> way to do it is use begin...end (equivalent to { } in C) after the then
> keyword.
>

This is what I committed finally.

2012-07-05  Ramana Radhakrishnan  <ramana.radhakrishnan@linaro.org>
	    Julian Brown  <julian@codesourcery.com>

        PR target/49891
        PR target/51980
        * config/arm/neon-gen.ml (return_by_ptr): Delete.
        (print_function): Handle empty strings.
        (return): Delete use of return_by_ptr.
        (mask_shape_for_shuffle): New function.
        (mask_elems): Likewise.
        (shuffle_fn): Likewise.
        (params): Simplify and remove use of return_by_ptr.
        (get_shuffle): New function.
        (print_variant): Update.
        * config/arm/neon.ml (rev_elems): New function.
        (permute_range): Likewise.
        (zip_range): Likewise.
        (uzip_range): Likewise.
        (trn_range): Likewise.
        (zip_elems): Likewise.
        (uzip_elems): Likewise.
        (trn_elems): Likewise.
        (features): New enumeration Use_shuffle. Delete ReturnPtr.
        (pf_su_8_16): New.
        (suf_32): New.
        (ops): Update entries for Vrev64, Vrev32, Vrev16, Vtr, Vzip, Vuzp.
        * config/arm/arm_neon.h: Regenerate.



2012-07-05  Ramana Radhakrishnan  <ramana.radhakrishnan@linaro.org>

        PR target/49891
        PR target/51980
        * gcc/testsuite/gcc.target/arm/neon/vtrnf32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vtrns32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vtrnu32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vzipf32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vzips32.c: Update.
        * gcc/testsuite/gcc.target/arm/neon/vzipu32.c: Update.




Ramana

> HTH,
>
> Julian

Patch

Index: gcc/testsuite/gcc.target/arm/neon/vzipu32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vzipu32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vzipu32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_uint32x2x2_t = vzip_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
 }
 
-/* { dg-final { scan-assembler "vzip\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vzipf32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vzipf32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vzipf32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_float32x2x2_t = vzip_f32 (arg0_float32x2_t, arg1_float32x2_t);
 }
 
-/* { dg-final { scan-assembler "vzip\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vtrns32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vtrns32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vtrns32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_int32x2x2_t = vtrn_s32 (arg0_int32x2_t, arg1_int32x2_t);
 }
 
-/* { dg-final { scan-assembler "vtrn\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vtrnu32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vtrnu32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vtrnu32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_uint32x2x2_t = vtrn_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
 }
 
-/* { dg-final { scan-assembler "vtrn\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vtrnf32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vtrnf32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vtrnf32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_float32x2x2_t = vtrn_f32 (arg0_float32x2_t, arg1_float32x2_t);
 }
 
-/* { dg-final { scan-assembler "vtrn\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vzips32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vzips32.c	(revision 189288)
+++ gcc/testsuite/gcc.target/arm/neon/vzips32.c	(working copy)
@@ -17,5 +17,5 @@ 
   out_int32x2x2_t = vzip_s32 (arg0_int32x2_t, arg1_int32x2_t);
 }
 
-/* { dg-final { scan-assembler "vzip\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vuzp\.32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/config/arm/neon.ml
===================================================================
--- gcc/config/arm/neon.ml	(revision 189288)
+++ gcc/config/arm/neon.ml	(working copy)
@@ -201,6 +201,42 @@ 
   (* Reinterpret casts.  *)
   | Vreinterp
 
+let rev_elems revsize elsize nelts _ =
+  let mask = (revsize / elsize) - 1 in
+  let arr = Array.init nelts
+    (fun i -> i lxor mask) in
+  Array.to_list arr
+
+let permute_range i stride nelts increment =
+  let rec build i = function
+    0 -> []
+  | nelts -> i :: (i + stride) :: build (i + increment) (pred nelts) in
+  build i nelts
+
+(* Generate a list of integers suitable for vzip.  *)
+let zip_range i stride nelts = permute_range i stride nelts 1
+
+(* Generate a list of integers suitable for vunzip.  *)
+let uzip_range i stride nelts = permute_range i stride nelts 4
+
+(* Generate a list of integers suitable for trn.  *)
+let trn_range i stride nelts = permute_range i stride nelts 2
+
+let zip_elems _ nelts part =
+  match part with
+    `lo -> zip_range 0 nelts (nelts / 2)
+  | `hi -> zip_range (nelts / 2) nelts (nelts / 2)
+
+let uzip_elems _ nelts part =
+  match part with
+    `lo -> uzip_range 0 2 (nelts / 2)
+  | `hi -> uzip_range 1 2 (nelts / 2)
+
+let trn_elems _ nelts part =
+  match part with
+    `lo -> trn_range 0 nelts (nelts / 2)
+  | `hi -> trn_range 1 nelts (nelts / 2)
+
 (* Features used for documentation, to distinguish between some instruction
    variants, and to signal special requirements (e.g. swapping arguments).  *)
 
@@ -214,7 +250,10 @@ 
   | Flipped of string  (* Builtin name to use with flipped arguments.  *)
   | InfoWord  (* Pass an extra word for signage/rounding etc. (always passed
                  for All _, Long, Wide, Narrow shape_forms.  *)
-  | ReturnPtr  (* Pass explicit pointer to return value as first argument.  *)
+    (* Implement builtin as shuffle.  The parameter is a function which returns
+       masks suitable for __builtin_shuffle: arguments are (element size,
+       number of elements, high/low part selector).  *)
+  | Use_shuffle of (int -> int -> [`lo|`hi] -> int list)
     (* A specification as to the shape of instruction expected upon
        disassembly, used if it differs from the shape used to build the
        intrinsic prototype.  Multiple entries in the constructor's argument
@@ -706,8 +745,10 @@ 
 let su_8_32 = [S8; S16; S32; U8; U16; U32]
 let su_8_64 = S64 :: U64 :: su_8_32
 let su_16_64 = [S16; S32; S64; U16; U32; U64]
+let pf_su_8_16 = [P8; P16; S8; S16; U8; U16]
 let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32
 let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64
+let suf_32 = [S32; U32; F32]
 
 let ops =
   [
@@ -1317,12 +1358,18 @@ 
       pf_su_8_64;
 
     (* Reverse elements.  *)
-    Vrev64, [], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32;
-    Vrev64, [], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
-    Vrev32, [], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16];
-    Vrev32, [], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16];
-    Vrev16, [], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8];
-    Vrev16, [], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8];
+    Vrev64, [Use_shuffle (rev_elems 64)], All (2, Dreg), "vrev64", bits_1,
+      P8 :: P16 :: F32 :: su_8_32;
+    Vrev64, [Use_shuffle (rev_elems 64)], All (2, Qreg), "vrev64Q", bits_1,
+      P8 :: P16 :: F32 :: su_8_32;
+    Vrev32, [Use_shuffle (rev_elems 32)], All (2, Dreg), "vrev32", bits_1,
+      [P8; P16; S8; U8; S16; U16];
+    Vrev32, [Use_shuffle (rev_elems 32)], All (2, Qreg), "vrev32Q", bits_1,
+      [P8; P16; S8; U8; S16; U16];
+    Vrev16, [Use_shuffle (rev_elems 16)], All (2, Dreg), "vrev16", bits_1,
+      [P8; S8; U8];
+    Vrev16, [Use_shuffle (rev_elems 16)], All (2, Qreg), "vrev16Q", bits_1,
+      [P8; S8; U8];
 
     (* Bit selection.  *)
     Vbsl,
@@ -1336,25 +1383,19 @@ 
       Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
       pf_su_8_64;
 
-    (* Transpose elements.  **NOTE** ReturnPtr goes some of the way towards
-       generating good code for intrinsics which return structure types --
-       builtins work well by themselves (and understand that the values being
-       stored on e.g. the stack also reside in registers, so can optimise the
-       stores away entirely if the results are used immediately), but
-       intrinsics are very much less efficient. Maybe something can be improved
-       re: inlining, or tweaking the ABI used for intrinsics (a special call
-       attribute?).
-    *)
-    Vtrn, [ReturnPtr], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32;
-    Vtrn, [ReturnPtr], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
-
+    Vtrn, [Use_shuffle trn_elems], Pair_result Dreg, "vtrn", bits_2, pf_su_8_16;
+    Vtrn, [Use_shuffle trn_elems; Instruction_name ["vuzp"]], Pair_result Dreg, "vtrn", bits_2, suf_32;
+    Vtrn, [Use_shuffle trn_elems], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
     (* Zip elements.  *)
-    Vzip, [ReturnPtr], Pair_result Dreg, "vzip", bits_2, pf_su_8_32;
-    Vzip, [ReturnPtr], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;
+    Vzip, [Use_shuffle zip_elems], Pair_result Dreg, "vzip", bits_2, pf_su_8_16;
+    Vzip, [Use_shuffle zip_elems; Instruction_name ["vuzp"]], Pair_result Dreg, "vzip", bits_2, suf_32;
+    Vzip, [Use_shuffle zip_elems], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32; 
 
     (* Unzip elements.  *)
-    Vuzp, [ReturnPtr], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32;
-    Vuzp, [ReturnPtr], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32;
+    Vuzp, [Use_shuffle uzip_elems], Pair_result Dreg, "vuzp", bits_2,
+      pf_su_8_32;
+    Vuzp, [Use_shuffle uzip_elems], Pair_result Qreg, "vuzpQ", bits_2,
+      pf_su_8_32;
 
     (* Element/structure loads.  VLD1 variants.  *)
     Vldx 1,
Index: gcc/config/arm/arm_neon.h
===================================================================
--- gcc/config/arm/arm_neon.h	(revision 189288)
+++ gcc/config/arm/arm_neon.h	(working copy)
@@ -7047,217 +7047,217 @@ 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vrev64_s8 (int8x8_t __a)
 {
-  return (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1);
+  return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vrev64_s16 (int16x4_t __a)
 {
-  return (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1);
+  return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }
 
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vrev64_s32 (int32x2_t __a)
 {
-  return (int32x2_t)__builtin_neon_vrev64v2si (__a, 1);
+  return (int32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vrev64_f32 (float32x2_t __a)
 {
-  return (float32x2_t)__builtin_neon_vrev64v2sf (__a, 3);
+  return (float32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vrev64_u8 (uint8x8_t __a)
 {
-  return (uint8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 0);
+  return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vrev64_u16 (uint16x4_t __a)
 {
-  return (uint16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 0);
+  return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vrev64_u32 (uint32x2_t __a)
 {
-  return (uint32x2_t)__builtin_neon_vrev64v2si ((int32x2_t) __a, 0);
+  return (uint32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vrev64_p8 (poly8x8_t __a)
 {
-  return (poly8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 2);
+  return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vrev64_p16 (poly16x4_t __a)
 {
-  return (poly16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 2);
+  return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vrev64q_s8 (int8x16_t __a)
 {
-  return (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1);
+  return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vrev64q_s16 (int16x8_t __a)
 {
-  return (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1);
+  return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vrev64q_s32 (int32x4_t __a)
 {
-  return (int32x4_t)__builtin_neon_vrev64v4si (__a, 1);
+  return (int32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vrev64q_f32 (float32x4_t __a)
 {
-  return (float32x4_t)__builtin_neon_vrev64v4sf (__a, 3);
+  return (float32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vrev64q_u8 (uint8x16_t __a)
 {
-  return (uint8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 0);
+  return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vrev64q_u16 (uint16x8_t __a)
 {
-  return (uint16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 0);
+  return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vrev64q_u32 (uint32x4_t __a)
 {
-  return (uint32x4_t)__builtin_neon_vrev64v4si ((int32x4_t) __a, 0);
+  return (uint32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vrev64q_p8 (poly8x16_t __a)
 {
-  return (poly8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 2);
+  return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vrev64q_p16 (poly16x8_t __a)
 {
-  return (poly16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 2);
+  return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vrev32_s8 (int8x8_t __a)
 {
-  return (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1);
+  return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vrev32_s16 (int16x4_t __a)
 {
-  return (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1);
+  return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vrev32_u8 (uint8x8_t __a)
 {
-  return (uint8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 0);
+  return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vrev32_u16 (uint16x4_t __a)
 {
-  return (uint16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 0);
+  return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vrev32_p8 (poly8x8_t __a)
 {
-  return (poly8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 2);
+  return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vrev32_p16 (poly16x4_t __a)
 {
-  return (poly16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 2);
+  return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vrev32q_s8 (int8x16_t __a)
 {
-  return (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1);
+  return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vrev32q_s16 (int16x8_t __a)
 {
-  return (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1);
+  return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vrev32q_u8 (uint8x16_t __a)
 {
-  return (uint8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 0);
+  return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vrev32q_u16 (uint16x8_t __a)
 {
-  return (uint16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 0);
+  return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vrev32q_p8 (poly8x16_t __a)
 {
-  return (poly8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 2);
+  return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vrev32q_p16 (poly16x8_t __a)
 {
-  return (poly16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 2);
+  return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vrev16_s8 (int8x8_t __a)
 {
-  return (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1);
+  return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vrev16_u8 (uint8x8_t __a)
 {
-  return (uint8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 0);
+  return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vrev16_p8 (poly8x8_t __a)
 {
-  return (poly8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 2);
+  return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vrev16q_s8 (int8x16_t __a)
 {
-  return (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1);
+  return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vrev16q_u8 (uint8x16_t __a)
 {
-  return (uint8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 0);
+  return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vrev16q_p8 (poly8x16_t __a)
 {
-  return (poly8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 2);
+  return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
@@ -7396,7 +7396,8 @@ 
 vtrn_s8 (int8x8_t __a, int8x8_t __b)
 {
   int8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7404,31 +7405,17 @@ 
 vtrn_s16 (int16x4_t __a, int16x4_t __b)
 {
   int16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vtrn_s32 (int32x2_t __a, int32x2_t __b)
-{
-  int32x2x2_t __rv;
-  __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b);
-  return __rv;
-}
-
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vtrn_f32 (float32x2_t __a, float32x2_t __b)
-{
-  float32x2x2_t __rv;
-  __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
-
 __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
 vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   uint8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7436,23 +7423,17 @@ 
 vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   uint16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  uint32x2x2_t __rv;
-  __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
-  return __rv;
-}
-
 __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
 vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   poly8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7460,15 +7441,44 @@ 
 vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
 {
   poly16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vtrn_s32 (int32x2_t __a, int32x2_t __b)
+{
+  int32x2x2_t __rv;
+  __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vtrn_f32 (float32x2_t __a, float32x2_t __b)
+{
+  float32x2x2_t __rv;
+  __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2x2_t __rv;
+  __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
 __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
 vtrnq_s8 (int8x16_t __a, int8x16_t __b)
 {
   int8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 });
+  __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 });
   return __rv;
 }
 
@@ -7476,7 +7486,8 @@ 
 vtrnq_s16 (int16x8_t __a, int16x8_t __b)
 {
   int16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7484,7 +7495,8 @@ 
 vtrnq_s32 (int32x4_t __a, int32x4_t __b)
 {
   int32x4x2_t __rv;
-  __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
@@ -7492,7 +7504,8 @@ 
 vtrnq_f32 (float32x4_t __a, float32x4_t __b)
 {
   float32x4x2_t __rv;
-  __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b);
+  __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
@@ -7500,7 +7513,8 @@ 
 vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   uint8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 });
+  __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 });
   return __rv;
 }
 
@@ -7508,7 +7522,8 @@ 
 vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   uint16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7516,7 +7531,8 @@ 
 vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   uint32x4x2_t __rv;
-  __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 2, 6 });
+  __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 5, 3, 7 });
   return __rv;
 }
 
@@ -7524,7 +7540,8 @@ 
 vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
   poly8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 });
+  __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 });
   return __rv;
 }
 
@@ -7532,7 +7549,8 @@ 
 vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
 {
   poly16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 2, 10, 4, 12, 6, 14 });
+  __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 9, 3, 11, 5, 13, 7, 15 });
   return __rv;
 }
 
@@ -7540,7 +7558,8 @@ 
 vzip_s8 (int8x8_t __a, int8x8_t __b)
 {
   int8x8x2_t __rv;
-  __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7548,31 +7567,17 @@ 
 vzip_s16 (int16x4_t __a, int16x4_t __b)
 {
   int16x4x2_t __rv;
-  __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vzip_s32 (int32x2_t __a, int32x2_t __b)
-{
-  int32x2x2_t __rv;
-  __builtin_neon_vzipv2si (&__rv.val[0], __a, __b);
-  return __rv;
-}
-
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vzip_f32 (float32x2_t __a, float32x2_t __b)
-{
-  float32x2x2_t __rv;
-  __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
-
 __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
 vzip_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   uint8x8x2_t __rv;
-  __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7580,23 +7585,17 @@ 
 vzip_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   uint16x4x2_t __rv;
-  __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vzip_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  uint32x2x2_t __rv;
-  __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
-  return __rv;
-}
-
 __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
 vzip_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   poly8x8x2_t __rv;
-  __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7604,15 +7603,44 @@ 
 vzip_p16 (poly16x4_t __a, poly16x4_t __b)
 {
   poly16x4x2_t __rv;
-  __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vzip_s32 (int32x2_t __a, int32x2_t __b)
+{
+  int32x2x2_t __rv;
+  __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vzip_f32 (float32x2_t __a, float32x2_t __b)
+{
+  float32x2x2_t __rv;
+  __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vzip_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  uint32x2x2_t __rv;
+  __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
+  return __rv;
+}
+
 __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
 vzipq_s8 (int8x16_t __a, int8x16_t __b)
 {
   int8x16x2_t __rv;
-  __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+  __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 });
   return __rv;
 }
 
@@ -7620,7 +7648,8 @@ 
 vzipq_s16 (int16x8_t __a, int16x8_t __b)
 {
   int16x8x2_t __rv;
-  __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7628,7 +7657,8 @@ 
 vzipq_s32 (int32x4_t __a, int32x4_t __b)
 {
   int32x4x2_t __rv;
-  __builtin_neon_vzipv4si (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
@@ -7636,7 +7666,8 @@ 
 vzipq_f32 (float32x4_t __a, float32x4_t __b)
 {
   float32x4x2_t __rv;
-  __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b);
+  __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
@@ -7644,7 +7675,8 @@ 
 vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   uint8x16x2_t __rv;
-  __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+  __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 });
   return __rv;
 }
 
@@ -7652,7 +7684,8 @@ 
 vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   uint16x8x2_t __rv;
-  __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7660,7 +7693,8 @@ 
 vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   uint32x4x2_t __rv;
-  __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 4, 1, 5 });
+  __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 6, 3, 7 });
   return __rv;
 }
 
@@ -7668,7 +7702,8 @@ 
 vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
   poly8x16x2_t __rv;
-  __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+  __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 });
   return __rv;
 }
 
@@ -7676,7 +7711,8 @@ 
 vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
 {
   poly16x8x2_t __rv;
-  __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 8, 1, 9, 2, 10, 3, 11 });
+  __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 4, 12, 5, 13, 6, 14, 7, 15 });
   return __rv;
 }
 
@@ -7684,7 +7720,8 @@ 
 vuzp_s8 (int8x8_t __a, int8x8_t __b)
 {
   int8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7692,7 +7729,8 @@ 
 vuzp_s16 (int16x4_t __a, int16x4_t __b)
 {
   int16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7700,7 +7738,8 @@ 
 vuzp_s32 (int32x2_t __a, int32x2_t __b)
 {
   int32x2x2_t __rv;
-  __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
   return __rv;
 }
 
@@ -7708,7 +7747,8 @@ 
 vuzp_f32 (float32x2_t __a, float32x2_t __b)
 {
   float32x2x2_t __rv;
-  __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b);
+  __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
   return __rv;
 }
 
@@ -7716,7 +7756,8 @@ 
 vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
 {
   uint8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7724,7 +7765,8 @@ 
 vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   uint16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7732,7 +7774,8 @@ 
 vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
 {
   uint32x2x2_t __rv;
-  __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+  __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 0, 2 });
+  __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, (uint32x2_t) { 1, 3 });
   return __rv;
 }
 
@@ -7740,7 +7783,8 @@ 
 vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
 {
   poly8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+  __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, (uint8x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7748,7 +7792,8 @@ 
 vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
 {
   poly16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+  __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, (uint16x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7756,7 +7801,8 @@ 
 vuzpq_s8 (int8x16_t __a, int8x16_t __b)
 {
   int8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
+  __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 });
   return __rv;
 }
 
@@ -7764,7 +7810,8 @@ 
 vuzpq_s16 (int16x8_t __a, int16x8_t __b)
 {
   int16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7772,7 +7819,8 @@ 
 vuzpq_s32 (int32x4_t __a, int32x4_t __b)
 {
   int32x4x2_t __rv;
-  __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b);
+  __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7780,7 +7828,8 @@ 
 vuzpq_f32 (float32x4_t __a, float32x4_t __b)
 {
   float32x4x2_t __rv;
-  __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b);
+  __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7788,7 +7837,8 @@ 
 vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
   uint8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
+  __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 });
   return __rv;
 }
 
@@ -7796,7 +7846,8 @@ 
 vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
   uint16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
@@ -7804,7 +7855,8 @@ 
 vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
   uint32x4x2_t __rv;
-  __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+  __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
+  __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
   return __rv;
 }
 
@@ -7812,7 +7864,8 @@ 
 vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
   poly8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+  __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
+  __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, (uint8x16_t) { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 });
   return __rv;
 }
 
@@ -7820,7 +7873,8 @@ 
 vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
 {
   poly16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+  __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 0, 2, 4, 6, 8, 10, 12, 14 });
+  __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, (uint16x8_t) { 1, 3, 5, 7, 9, 11, 13, 15 });
   return __rv;
 }
 
Index: gcc/config/arm/neon-gen.ml
===================================================================
--- gcc/config/arm/neon-gen.ml	(revision 189288)
+++ gcc/config/arm/neon-gen.ml	(working copy)
@@ -91,15 +91,14 @@ 
   end;
   open_braceblock ffmt;
   let rec print_lines = function
-    [] -> ()
+    []       -> ()
+  | "" :: lines -> print_lines lines
   | [line] -> Format.printf "%s" line
-  | line::lines -> Format.printf "%s@," line; print_lines lines in
+  | line::lines -> Format.printf "%s@," line ; print_lines lines in
   print_lines body;
   close_braceblock ffmt;
   end_function ffmt
 
-let return_by_ptr features = List.mem ReturnPtr features
-
 let union_string num elts base =
   let itype = inttype_for_array num elts in
   let iname = string_of_inttype itype
@@ -141,29 +140,76 @@ 
 
 (* Return a tuple of a list of declarations to go at the start of the function,
    and a list of statements needed to return THING.  *)
-let return arity return_by_ptr thing =
+let return arity thing =
   match arity with
     Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
   | Arity4 (ret, _, _, _, _) ->
-    match ret with
-      T_arrayof (num, vec) ->
-        if return_by_ptr then
-          let sname = string_of_vectype ret in
-          [Printf.sprintf "%s __rv;" sname],
-          [thing ^ ";"; "return __rv;"]
-        else
+      begin match ret with
+	T_arrayof (num, vec) ->
           let uname = union_string num vec "__rv" in
           [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"]
-    | T_void -> [], [thing ^ ";"]
-    | _ ->
-        [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+      | T_void ->
+	  [], [thing ^ ";"]
+      | _ ->
+	  [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+      end
 
+let mask_shape_for_shuffle = function
+    All (num, reg) -> All (num, reg)
+  | Pair_result reg -> All (2, reg)
+  | _ -> failwith "mask_for_shuffle"
+
+let mask_elems shuffle shape elttype part =
+  let elem_size = elt_width elttype in
+  let num_elems =
+    match regmap shape 0 with
+      Dreg -> 64 / elem_size
+    | Qreg -> 128 / elem_size
+    | _ -> failwith "mask_elems" in
+  shuffle elem_size num_elems part
+
+(* Return a tuple of a list of declarations 0and a list of statements needed
+   to implement an intrinsic using __builtin_shuffle.  SHUFFLE is a function
+   which returns a list of elements suitable for using as a mask.  *)
+
+let shuffle_fn shuffle shape arity elttype =
+  let mshape = mask_shape_for_shuffle shape in
+  let masktype = type_for_elt mshape (unsigned_of_elt elttype) 0 in
+  let masktype_str = string_of_vectype masktype in
+  let shuffle_res = type_for_elt mshape elttype 0 in
+  let shuffle_res_str = string_of_vectype shuffle_res in
+  match arity with
+    Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
+  | Arity4 (ret, _, _, _, _) ->
+      begin match ret with
+        T_arrayof (num, vec) ->
+	  let elems1 = mask_elems shuffle mshape elttype `lo
+	  and elems2 = mask_elems shuffle mshape elttype `hi in
+	  let mask1 = (String.concat ", " (List.map string_of_int elems1))
+	  and mask2 = (String.concat ", " (List.map string_of_int elems2)) in
+	  let shuf1 = Printf.sprintf
+	    "__rv.val[0] = (%s) __builtin_shuffle (__a, __b, (%s) { %s });"
+	    shuffle_res_str masktype_str mask1
+	  and shuf2 = Printf.sprintf
+	    "__rv.val[1] = (%s) __builtin_shuffle (__a, __b, (%s) { %s });"
+	    shuffle_res_str masktype_str mask2 in
+	  [Printf.sprintf "%s __rv;" (string_of_vectype ret);],
+	  [shuf1; shuf2; "return __rv;"]
+      | _ ->
+          let elems = mask_elems shuffle mshape elttype `lo in
+          let mask =  (String.concat ", " (List.map string_of_int elems)) in
+	  let shuf = Printf.sprintf
+	    "return (%s) __builtin_shuffle (__a, (%s) { %s });" shuffle_res_str masktype_str mask in
+	  [""],
+	  [shuf]
+      end
+
 let rec element_type ctype =
   match ctype with
     T_arrayof (_, v) -> element_type v
   | _ -> ctype
 
-let params return_by_ptr ps =
+let params ps =
   let pdecls = ref [] in
   let ptype t p =
     match t with
@@ -180,13 +226,7 @@ 
   | Arity3 (_, t1, t2, t3) -> [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"]
   | Arity4 (_, t1, t2, t3, t4) ->
       [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"; ptype t4 "__d"] in
-  match ps with
-    Arity0 ret | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
-  | Arity4 (ret, _, _, _, _) ->
-      if return_by_ptr then
-        !pdecls, add_cast (T_ptrto (element_type ret)) "&__rv.val[0]" :: plist
-      else
-        !pdecls, plist
+  !pdecls, plist
 
 let modify_params features plist =
   let is_flipped =
@@ -239,17 +279,27 @@ 
     and srcmode = mode_of_elt src shape in
     string_of_mode dstmode ^ string_of_mode srcmode
 
+let get_shuffle features =
+  try
+    match List.find (function Use_shuffle _ -> true | _ -> false) features with
+      Use_shuffle fn -> Some fn
+    | _ -> None
+  with Not_found -> None
+
 let print_variant opcode features shape name (ctype, asmtype, elttype) =
   let bits = infoword_value elttype features in
   let modesuf = mode_suffix elttype shape in
-  let return_by_ptr = return_by_ptr features in
-  let pdecls, paramlist = params return_by_ptr ctype in
-  let paramlist' = modify_params features paramlist in
-  let paramlist'' = extra_word shape features paramlist' bits in
-  let parstr = String.concat ", " paramlist'' in
-  let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
-                  (builtin_name features name) modesuf parstr in
-  let rdecls, stmts = return ctype return_by_ptr builtin in
+  let pdecls, paramlist = params ctype in
+  let rdecls, stmts =
+    match get_shuffle features with
+      Some shuffle -> shuffle_fn shuffle shape ctype elttype
+    | None ->
+	let paramlist' = modify_params features paramlist in
+	let paramlist'' = extra_word shape features paramlist' bits in
+	let parstr = String.concat ", " paramlist'' in
+	let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
+                	(builtin_name features name) modesuf parstr in
+	return ctype builtin in
   let body = pdecls @ rdecls @ stmts
   and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
   print_function ctype fnname body