@@ -7047,217 +7047,325 @@ vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vrev64_s8 (int8x8_t __a)
{
- return (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1);
+ int8x8_t __rv;
+ uint8x8_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0};
+ __rv = (int8x8_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vrev64_s16 (int16x4_t __a)
{
- return (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1);
+ int16x4_t __rv;
+ uint16x4_t __mask1 = {3, 2, 1, 0};
+ __rv = (int16x4_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vrev64_s32 (int32x2_t __a)
{
- return (int32x2_t)__builtin_neon_vrev64v2si (__a, 1);
+ int32x2_t __rv;
+ uint32x2_t __mask1 = {1, 0};
+ __rv = (int32x2_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vrev64_f32 (float32x2_t __a)
{
- return (float32x2_t)__builtin_neon_vrev64v2sf (__a, 3);
+ float32x2_t __rv;
+ uint32x2_t __mask1 = {1, 0};
+ __rv = (float32x2_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrev64_u8 (uint8x8_t __a)
{
- return (uint8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 0);
+ uint8x8_t __rv;
+ uint8x8_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0};
+ __rv = (uint8x8_t) __builtin_shuffle ((int8x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vrev64_u16 (uint16x4_t __a)
{
- return (uint16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 0);
+ uint16x4_t __rv;
+ uint16x4_t __mask1 = {3, 2, 1, 0};
+ __rv = (uint16x4_t) __builtin_shuffle ((int16x4_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
vrev64_u32 (uint32x2_t __a)
{
- return (uint32x2_t)__builtin_neon_vrev64v2si ((int32x2_t) __a, 0);
+ uint32x2_t __rv;
+ uint32x2_t __mask1 = {1, 0};
+ __rv = (uint32x2_t) __builtin_shuffle ((int32x2_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
vrev64_p8 (poly8x8_t __a)
{
- return (poly8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 2);
+ poly8x8_t __rv;
+ uint8x8_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0};
+ __rv = (poly8x8_t) __builtin_shuffle ((int8x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
vrev64_p16 (poly16x4_t __a)
{
- return (poly16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 2);
+ poly16x4_t __rv;
+ uint16x4_t __mask1 = {3, 2, 1, 0};
+ __rv = (poly16x4_t) __builtin_shuffle ((int16x4_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
vrev64q_s8 (int8x16_t __a)
{
- return (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1);
+ int8x16_t __rv;
+ uint8x16_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+ __rv = (int8x16_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vrev64q_s16 (int16x8_t __a)
{
- return (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1);
+ int16x8_t __rv;
+ uint16x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4};
+ __rv = (int16x8_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vrev64q_s32 (int32x4_t __a)
{
- return (int32x4_t)__builtin_neon_vrev64v4si (__a, 1);
+ int32x4_t __rv;
+ uint32x4_t __mask1 = {1, 0, 3, 2};
+ __rv = (int32x4_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vrev64q_f32 (float32x4_t __a)
{
- return (float32x4_t)__builtin_neon_vrev64v4sf (__a, 3);
+ float32x4_t __rv;
+ uint32x4_t __mask1 = {1, 0, 3, 2};
+ __rv = (float32x4_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vrev64q_u8 (uint8x16_t __a)
{
- return (uint8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 0);
+ uint8x16_t __rv;
+ uint8x16_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+ __rv = (uint8x16_t) __builtin_shuffle ((int8x16_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vrev64q_u16 (uint16x8_t __a)
{
- return (uint16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 0);
+ uint16x8_t __rv;
+ uint16x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4};
+ __rv = (uint16x8_t) __builtin_shuffle ((int16x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vrev64q_u32 (uint32x4_t __a)
{
- return (uint32x4_t)__builtin_neon_vrev64v4si ((int32x4_t) __a, 0);
+ uint32x4_t __rv;
+ uint32x4_t __mask1 = {1, 0, 3, 2};
+ __rv = (uint32x4_t) __builtin_shuffle ((int32x4_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
vrev64q_p8 (poly8x16_t __a)
{
- return (poly8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 2);
+ poly8x16_t __rv;
+ uint8x16_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+ __rv = (poly8x16_t) __builtin_shuffle ((int8x16_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
vrev64q_p16 (poly16x8_t __a)
{
- return (poly16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 2);
+ poly16x8_t __rv;
+ uint16x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4};
+ __rv = (poly16x8_t) __builtin_shuffle ((int16x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vrev32_s8 (int8x8_t __a)
{
- return (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1);
+ int8x8_t __rv;
+ uint8x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4};
+ __rv = (int8x8_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vrev32_s16 (int16x4_t __a)
{
- return (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1);
+ int16x4_t __rv;
+ uint16x4_t __mask1 = {1, 0, 3, 2};
+ __rv = (int16x4_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrev32_u8 (uint8x8_t __a)
{
- return (uint8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 0);
+ uint8x8_t __rv;
+ uint8x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4};
+ __rv = (uint8x8_t) __builtin_shuffle ((int8x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vrev32_u16 (uint16x4_t __a)
{
- return (uint16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 0);
+ uint16x4_t __rv;
+ uint16x4_t __mask1 = {1, 0, 3, 2};
+ __rv = (uint16x4_t) __builtin_shuffle ((int16x4_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
vrev32_p8 (poly8x8_t __a)
{
- return (poly8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 2);
+ poly8x8_t __rv;
+ uint8x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4};
+ __rv = (poly8x8_t) __builtin_shuffle ((int8x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
vrev32_p16 (poly16x4_t __a)
{
- return (poly16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 2);
+ poly16x4_t __rv;
+ uint16x4_t __mask1 = {1, 0, 3, 2};
+ __rv = (poly16x4_t) __builtin_shuffle ((int16x4_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
vrev32q_s8 (int8x16_t __a)
{
- return (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1);
+ int8x16_t __rv;
+ uint8x16_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
+ __rv = (int8x16_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vrev32q_s16 (int16x8_t __a)
{
- return (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1);
+ int16x8_t __rv;
+ uint16x8_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6};
+ __rv = (int16x8_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vrev32q_u8 (uint8x16_t __a)
{
- return (uint8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 0);
+ uint8x16_t __rv;
+ uint8x16_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
+ __rv = (uint8x16_t) __builtin_shuffle ((int8x16_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vrev32q_u16 (uint16x8_t __a)
{
- return (uint16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 0);
+ uint16x8_t __rv;
+ uint16x8_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6};
+ __rv = (uint16x8_t) __builtin_shuffle ((int16x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
vrev32q_p8 (poly8x16_t __a)
{
- return (poly8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 2);
+ poly8x16_t __rv;
+ uint8x16_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
+ __rv = (poly8x16_t) __builtin_shuffle ((int8x16_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
vrev32q_p16 (poly16x8_t __a)
{
- return (poly16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 2);
+ poly16x8_t __rv;
+ uint16x8_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6};
+ __rv = (poly16x8_t) __builtin_shuffle ((int16x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vrev16_s8 (int8x8_t __a)
{
- return (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1);
+ int8x8_t __rv;
+ uint8x8_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6};
+ __rv = (int8x8_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrev16_u8 (uint8x8_t __a)
{
- return (uint8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 0);
+ uint8x8_t __rv;
+ uint8x8_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6};
+ __rv = (uint8x8_t) __builtin_shuffle ((int8x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
vrev16_p8 (poly8x8_t __a)
{
- return (poly8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 2);
+ poly8x8_t __rv;
+ uint8x8_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6};
+ __rv = (poly8x8_t) __builtin_shuffle ((int8x8_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
vrev16q_s8 (int8x16_t __a)
{
- return (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1);
+ int8x16_t __rv;
+ uint8x16_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ __rv = (int8x16_t) __builtin_shuffle (__a , __mask1);
+ return __rv;
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vrev16q_u8 (uint8x16_t __a)
{
- return (uint8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 0);
+ uint8x16_t __rv;
+ uint8x16_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ __rv = (uint8x16_t) __builtin_shuffle ((int8x16_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
vrev16q_p8 (poly8x16_t __a)
{
- return (poly8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 2);
+ poly8x16_t __rv;
+ uint8x16_t __mask1 = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ __rv = (poly8x16_t) __builtin_shuffle ((int8x16_t) __a , __mask1);
+ return __rv;
}
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
@@ -7396,7 +7504,10 @@ __extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
vtrn_s8 (int8x8_t __a, int8x8_t __b)
{
int8x8x2_t __rv;
- __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b);
+ uint8x8_t __mask1 = {0, 8, 2, 10, 4, 12, 6, 14};
+ uint8x8_t __mask2 = {1, 9, 3, 11, 5, 13, 7, 15};
+ __rv.val[0] = (int8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7404,7 +7515,10 @@ __extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vtrn_s16 (int16x4_t __a, int16x4_t __b)
{
int16x4x2_t __rv;
- __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b);
+ uint16x4_t __mask1 = {0, 4, 2, 6};
+ uint16x4_t __mask2 = {1, 5, 3, 7};
+ __rv.val[0] = (int16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7412,7 +7526,10 @@ __extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
vtrn_s32 (int32x2_t __a, int32x2_t __b)
{
int32x2x2_t __rv;
- __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (int32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7420,7 +7537,10 @@ __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
vtrn_f32 (float32x2_t __a, float32x2_t __b)
{
float32x2x2_t __rv;
- __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (float32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7428,7 +7548,10 @@ __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
{
uint8x8x2_t __rv;
- __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = {0, 8, 2, 10, 4, 12, 6, 14};
+ uint8x8_t __mask2 = {1, 9, 3, 11, 5, 13, 7, 15};
+ __rv.val[0] = (uint8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7436,7 +7559,10 @@ __extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
{
uint16x4x2_t __rv;
- __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = {0, 4, 2, 6};
+ uint16x4_t __mask2 = {1, 5, 3, 7};
+ __rv.val[0] = (uint16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7444,7 +7570,10 @@ __extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
{
uint32x2x2_t __rv;
- __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (uint32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7452,7 +7581,10 @@ __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
{
poly8x8x2_t __rv;
- __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = {0, 8, 2, 10, 4, 12, 6, 14};
+ uint8x8_t __mask2 = {1, 9, 3, 11, 5, 13, 7, 15};
+ __rv.val[0] = (poly8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7460,7 +7592,10 @@ __extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
{
poly16x4x2_t __rv;
- __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = {0, 4, 2, 6};
+ uint16x4_t __mask2 = {1, 5, 3, 7};
+ __rv.val[0] = (poly16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7468,7 +7603,10 @@ __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
vtrnq_s8 (int8x16_t __a, int8x16_t __b)
{
int8x16x2_t __rv;
- __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b);
+ uint8x16_t __mask1 = {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30};
+ uint8x16_t __mask2 = {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31};
+ __rv.val[0] = (int8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7476,7 +7614,10 @@ __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
vtrnq_s16 (int16x8_t __a, int16x8_t __b)
{
int16x8x2_t __rv;
- __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b);
+ uint16x8_t __mask1 = {0, 8, 2, 10, 4, 12, 6, 14};
+ uint16x8_t __mask2 = {1, 9, 3, 11, 5, 13, 7, 15};
+ __rv.val[0] = (int16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7484,7 +7625,10 @@ __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
vtrnq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
- __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = {0, 4, 2, 6};
+ uint32x4_t __mask2 = {1, 5, 3, 7};
+ __rv.val[0] = (int32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7492,7 +7636,10 @@ __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
vtrnq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
- __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = {0, 4, 2, 6};
+ uint32x4_t __mask2 = {1, 5, 3, 7};
+ __rv.val[0] = (float32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7500,7 +7647,10 @@ __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
{
uint8x16x2_t __rv;
- __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30};
+ uint8x16_t __mask2 = {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31};
+ __rv.val[0] = (uint8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7508,7 +7658,10 @@ __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
{
uint16x8x2_t __rv;
- __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = {0, 8, 2, 10, 4, 12, 6, 14};
+ uint16x8_t __mask2 = {1, 9, 3, 11, 5, 13, 7, 15};
+ __rv.val[0] = (uint16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7516,7 +7669,10 @@ __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
- __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ uint32x4_t __mask1 = {0, 4, 2, 6};
+ uint32x4_t __mask2 = {1, 5, 3, 7};
+ __rv.val[0] = (uint32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7524,7 +7680,10 @@ __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
{
poly8x16x2_t __rv;
- __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30};
+ uint8x16_t __mask2 = {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31};
+ __rv.val[0] = (poly8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7532,7 +7691,10 @@ __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
{
poly16x8x2_t __rv;
- __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = {0, 8, 2, 10, 4, 12, 6, 14};
+ uint16x8_t __mask2 = {1, 9, 3, 11, 5, 13, 7, 15};
+ __rv.val[0] = (poly16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7540,7 +7702,10 @@ __extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
vzip_s8 (int8x8_t __a, int8x8_t __b)
{
int8x8x2_t __rv;
- __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b);
+ uint8x8_t __mask1 = {0, 8, 1, 9, 2, 10, 3, 11};
+ uint8x8_t __mask2 = {4, 12, 5, 13, 6, 14, 7, 15};
+ __rv.val[0] = (int8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7548,7 +7713,10 @@ __extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vzip_s16 (int16x4_t __a, int16x4_t __b)
{
int16x4x2_t __rv;
- __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b);
+ uint16x4_t __mask1 = {0, 4, 1, 5};
+ uint16x4_t __mask2 = {2, 6, 3, 7};
+ __rv.val[0] = (int16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7556,7 +7724,10 @@ __extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
vzip_s32 (int32x2_t __a, int32x2_t __b)
{
int32x2x2_t __rv;
- __builtin_neon_vzipv2si (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (int32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7564,7 +7735,10 @@ __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
vzip_f32 (float32x2_t __a, float32x2_t __b)
{
float32x2x2_t __rv;
- __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (float32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7572,7 +7746,10 @@ __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
vzip_u8 (uint8x8_t __a, uint8x8_t __b)
{
uint8x8x2_t __rv;
- __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = {0, 8, 1, 9, 2, 10, 3, 11};
+ uint8x8_t __mask2 = {4, 12, 5, 13, 6, 14, 7, 15};
+ __rv.val[0] = (uint8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7580,7 +7757,10 @@ __extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
vzip_u16 (uint16x4_t __a, uint16x4_t __b)
{
uint16x4x2_t __rv;
- __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = {0, 4, 1, 5};
+ uint16x4_t __mask2 = {2, 6, 3, 7};
+ __rv.val[0] = (uint16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7588,7 +7768,10 @@ __extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
vzip_u32 (uint32x2_t __a, uint32x2_t __b)
{
uint32x2x2_t __rv;
- __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (uint32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7596,7 +7779,10 @@ __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
vzip_p8 (poly8x8_t __a, poly8x8_t __b)
{
poly8x8x2_t __rv;
- __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = {0, 8, 1, 9, 2, 10, 3, 11};
+ uint8x8_t __mask2 = {4, 12, 5, 13, 6, 14, 7, 15};
+ __rv.val[0] = (poly8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7604,7 +7790,10 @@ __extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
vzip_p16 (poly16x4_t __a, poly16x4_t __b)
{
poly16x4x2_t __rv;
- __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = {0, 4, 1, 5};
+ uint16x4_t __mask2 = {2, 6, 3, 7};
+ __rv.val[0] = (poly16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7612,7 +7801,10 @@ __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
vzipq_s8 (int8x16_t __a, int8x16_t __b)
{
int8x16x2_t __rv;
- __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b);
+ uint8x16_t __mask1 = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
+ uint8x16_t __mask2 = {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+ __rv.val[0] = (int8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7620,7 +7812,10 @@ __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
vzipq_s16 (int16x8_t __a, int16x8_t __b)
{
int16x8x2_t __rv;
- __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b);
+ uint16x8_t __mask1 = {0, 8, 1, 9, 2, 10, 3, 11};
+ uint16x8_t __mask2 = {4, 12, 5, 13, 6, 14, 7, 15};
+ __rv.val[0] = (int16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7628,7 +7823,10 @@ __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
vzipq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
- __builtin_neon_vzipv4si (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = {0, 4, 1, 5};
+ uint32x4_t __mask2 = {2, 6, 3, 7};
+ __rv.val[0] = (int32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7636,7 +7834,10 @@ __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
vzipq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
- __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = {0, 4, 1, 5};
+ uint32x4_t __mask2 = {2, 6, 3, 7};
+ __rv.val[0] = (float32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7644,7 +7845,10 @@ __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
{
uint8x16x2_t __rv;
- __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
+ uint8x16_t __mask2 = {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+ __rv.val[0] = (uint8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7652,7 +7856,10 @@ __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
{
uint16x8x2_t __rv;
- __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = {0, 8, 1, 9, 2, 10, 3, 11};
+ uint16x8_t __mask2 = {4, 12, 5, 13, 6, 14, 7, 15};
+ __rv.val[0] = (uint16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7660,7 +7867,10 @@ __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
- __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ uint32x4_t __mask1 = {0, 4, 1, 5};
+ uint32x4_t __mask2 = {2, 6, 3, 7};
+ __rv.val[0] = (uint32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7668,7 +7878,10 @@ __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
{
poly8x16x2_t __rv;
- __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
+ uint8x16_t __mask2 = {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+ __rv.val[0] = (poly8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7676,7 +7889,10 @@ __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
{
poly16x8x2_t __rv;
- __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = {0, 8, 1, 9, 2, 10, 3, 11};
+ uint16x8_t __mask2 = {4, 12, 5, 13, 6, 14, 7, 15};
+ __rv.val[0] = (poly16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7684,7 +7900,10 @@ __extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
vuzp_s8 (int8x8_t __a, int8x8_t __b)
{
int8x8x2_t __rv;
- __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b);
+ uint8x8_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14};
+ uint8x8_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15};
+ __rv.val[0] = (int8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7692,7 +7911,10 @@ __extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vuzp_s16 (int16x4_t __a, int16x4_t __b)
{
int16x4x2_t __rv;
- __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b);
+ uint16x4_t __mask1 = {0, 2, 4, 6};
+ uint16x4_t __mask2 = {1, 3, 5, 7};
+ __rv.val[0] = (int16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7700,7 +7922,10 @@ __extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
vuzp_s32 (int32x2_t __a, int32x2_t __b)
{
int32x2x2_t __rv;
- __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (int32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7708,7 +7933,10 @@ __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
vuzp_f32 (float32x2_t __a, float32x2_t __b)
{
float32x2x2_t __rv;
- __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (float32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7716,7 +7944,10 @@ __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
{
uint8x8x2_t __rv;
- __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14};
+ uint8x8_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15};
+ __rv.val[0] = (uint8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7724,7 +7955,10 @@ __extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
{
uint16x4x2_t __rv;
- __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = {0, 2, 4, 6};
+ uint16x4_t __mask2 = {1, 3, 5, 7};
+ __rv.val[0] = (uint16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7732,7 +7966,10 @@ __extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
{
uint32x2x2_t __rv;
- __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ uint32x2_t __mask1 = {0, 2};
+ uint32x2_t __mask2 = {1, 3};
+ __rv.val[0] = (uint32x2_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x2_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7740,7 +7977,10 @@ __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
{
poly8x8x2_t __rv;
- __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14};
+ uint8x8_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15};
+ __rv.val[0] = (poly8x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7748,7 +7988,10 @@ __extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
{
poly16x4x2_t __rv;
- __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = {0, 2, 4, 6};
+ uint16x4_t __mask2 = {1, 3, 5, 7};
+ __rv.val[0] = (poly16x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7756,7 +7999,10 @@ __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
vuzpq_s8 (int8x16_t __a, int8x16_t __b)
{
int8x16x2_t __rv;
- __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b);
+ uint8x16_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+ uint8x16_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
+ __rv.val[0] = (int8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7764,7 +8010,10 @@ __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
vuzpq_s16 (int16x8_t __a, int16x8_t __b)
{
int16x8x2_t __rv;
- __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b);
+ uint16x8_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14};
+ uint16x8_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15};
+ __rv.val[0] = (int16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7772,7 +8021,10 @@ __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
vuzpq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
- __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = {0, 2, 4, 6};
+ uint32x4_t __mask2 = {1, 3, 5, 7};
+ __rv.val[0] = (int32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7780,7 +8032,10 @@ __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
vuzpq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
- __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = {0, 2, 4, 6};
+ uint32x4_t __mask2 = {1, 3, 5, 7};
+ __rv.val[0] = (float32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7788,7 +8043,10 @@ __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
{
uint8x16x2_t __rv;
- __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+ uint8x16_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
+ __rv.val[0] = (uint8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7796,7 +8054,10 @@ __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
{
uint16x8x2_t __rv;
- __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14};
+ uint16x8_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15};
+ __rv.val[0] = (uint16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7804,7 +8065,10 @@ __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
- __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ uint32x4_t __mask1 = {0, 2, 4, 6};
+ uint32x4_t __mask2 = {1, 3, 5, 7};
+ __rv.val[0] = (uint32x4_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x4_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7812,7 +8076,10 @@ __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
{
poly8x16x2_t __rv;
- __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+ uint8x16_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
+ __rv.val[0] = (poly8x16_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x16_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7820,7 +8087,10 @@ __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
{
poly16x8x2_t __rv;
- __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = {0, 2, 4, 6, 8, 10, 12, 14};
+ uint16x8_t __mask2 = {1, 3, 5, 7, 9, 11, 13, 15};
+ __rv.val[0] = (poly16x8_t)__builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x8_t)__builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -98,7 +98,7 @@ let print_function arity fnname body =
close_braceblock ffmt;
end_function ffmt
-let return_by_ptr features = List.mem ReturnPtr features
+let gcc_builtin_shuffle features = List.exists (function GCCBuiltinShuffle (a, b) -> true | _ -> false) features
let union_string num elts base =
let itype = inttype_for_array num elts in
@@ -137,33 +137,242 @@ let add_cast ctype cval =
else
cval
+(* This function gives the base type for any vector type
+ that we care about for the permute operations. Expand as need
+ be for other cases. *)
+let rec base_type vectype =
+ match vectype with
+ T_int8x8
+ | T_uint8x8
+ | T_poly8x8
+ | T_int16x4
+ | T_uint16x4
+ | T_poly16x4
+ | T_int32x2
+ | T_uint32x2
+ | T_float32x2
+ | T_int8x16
+ | T_uint8x16
+ | T_poly8x16
+ | T_int16x8
+ | T_uint16x8
+ | T_poly16x8
+ | T_int32x4
+ | T_float32x4
+ | T_uint32x4 -> vectype
+ | T_arrayof (num, base) -> base_type base
+ | _ -> raise Not_found
+
+(* This function tells us what type to give to the mask. *)
+let rec masktype vectype =
+ match vectype with
+ T_int8x8
+ | T_uint8x8
+ | T_poly8x8 -> T_uint8x8
+ | T_int16x4
+ | T_uint16x4
+ | T_poly16x4 -> T_uint16x4
+ | T_int32x2
+ | T_uint32x2
+ | T_float32x2 -> T_uint32x2
+ | T_int8x16
+ | T_uint8x16
+ | T_poly8x16 -> T_uint8x16
+ | T_int16x8
+ | T_uint16x8
+ | T_poly16x8 -> T_uint16x8
+ | T_int32x4
+ | T_float32x4
+ | T_uint32x4 -> T_uint32x4
+ | T_arrayof (num, base) -> masktype base
+ | _ -> raise Not_found
+
+(* Return number of elements available in the underlying vector
+ type. *)
+let rec num_vec_elt vectype =
+ match vectype with
+ T_int8x8
+ | T_uint8x8
+ | T_poly8x8 -> 8
+ | T_int16x4
+ | T_uint16x4
+ | T_poly16x4 -> 4
+ | T_int32x2
+ | T_uint32x2
+ | T_float32x2 -> 2
+ | T_int8x16
+ | T_uint8x16
+ | T_poly8x16 -> 16
+ | T_int16x8
+ | T_uint16x8
+ | T_poly16x8 -> 8
+ | T_int32x4
+ | T_float32x4
+ | T_uint32x4 -> 4
+ | T_arrayof (num, base) -> (num_vec_elt base)
+ | _ -> raise Not_found
+
+
let cast_for_return to_ty = "(" ^ (string_of_vectype to_ty) ^ ")"
+(* Produce a list of integers in the descending range i ... j. *)
+let rec range i j = if i < j then [] else i :: (range (i - 1) j )
+let gen_revmask high low = List.map string_of_int (range high low)
+
+(* An initialization to produce the right value for the mask
+ that gets produced in the form of a const_vec. This could be
+ written much better in terms of proper permutations like
+ some of the zip, unzip and trn implementations below. *)
+let init_rev_mask elttype maskty =
+ (let rangelim = (match elttype with
+ I64 ->
+ (match maskty with
+ T_uint8x8 -> [(7, 0)]
+ | T_uint16x4 -> [(3, 0)]
+ | T_uint32x2 -> [(1, 0)]
+ | T_uint16x8 -> [(3, 0) ; (7, 4)]
+ | T_uint32x4 -> [(1, 0) ; (3, 2)]
+ | T_uint8x16 -> [(7, 0) ; (15, 8)]
+ | _ -> raise Not_found)
+ | I32 ->
+ (match maskty with
+ T_uint8x8 -> [(3, 0) ; (7, 4)]
+ | T_uint16x4 -> [(1, 0) ; (3, 2)]
+ | T_uint16x8 -> [(1, 0) ; (3, 2) ; (5, 4); (7, 6)]
+ | T_uint8x16 -> [(3, 0) ; (7, 4) ; (11, 8); (15, 12)]
+ | _ -> raise Not_found)
+ | I16 ->
+ (match maskty with
+ T_uint8x8 -> [(1, 0) ; (3, 2); (5, 4); (7, 6)]
+ | T_uint8x16 -> [(1, 0) ; (3, 2); (5, 4); (7, 6); (9, 8); (11, 10); (13, 12); (15, 14)]
+ | _ -> raise Not_found)
+ | _ -> raise Not_found) in
+ let rec strlist t = (match t with
+ [] -> []
+ | (x, y) :: xs -> (String.concat ", " (gen_revmask x y)) :: strlist xs) in
+ "{" ^ (String.concat ", " (strlist rangelim)) ^ "}")
+
+(* Generic helper function that produces a permutation based on
+ an initial value, stride, number of elements and an increment value. *)
+let rec permute_range i stride nelts increment =
+let j = i + stride in
+if nelts = 0
+ then []
+else
+ let ls = i :: [j] in
+ List.append ls (permute_range (i + increment) stride (nelts - 1) increment)
+
+(* Generate a list of integers suitable for vzip. *)
+let rec zip_range i stride nelts = permute_range i stride nelts 1
+(* Generate a list of integers suitable for vunzip. *)
+let rec uzip_range i stride nelts = permute_range i stride nelts 4
+(* Generate a list of integers suitable for trn. *)
+let rec trn_range i stride nelts = permute_range i stride nelts 2
+
+(* Fixme: Not fully happy with the way in which this is written. Screams for
+ slightly better factoring. *)
+let init_zip_mask shufop maskty =
+ let num_elts = num_vec_elt maskty in
+ let printstr = match shufop with
+ Ziplo -> (match maskty with
+ T_uint8x8
+ | T_uint16x4
+ | T_uint16x8
+ | T_uint32x2
+ | T_uint32x4
+ | T_uint8x16 -> (List.map string_of_int (zip_range 0 num_elts (num_elts / 2)))
+ | _ -> raise Not_found)
+ | Ziphi -> (match maskty with
+ T_uint8x8
+ | T_uint16x8
+ | T_uint16x4
+ | T_uint32x2
+ | T_uint32x4
+ | T_uint8x16 -> (List.map string_of_int (zip_range (num_elts / 2) num_elts (num_elts / 2)))
+ | _ ->raise Not_found)
+ | Uzplo -> (match maskty with
+ T_uint8x8
+ | T_uint16x4
+ | T_uint16x8
+ | T_uint32x2
+ | T_uint32x4
+ | T_uint8x16 -> (List.map string_of_int (uzip_range 0 2 (num_elts / 2)))
+ | _ -> raise Not_found)
+ | Uzphi -> (match maskty with
+ T_uint8x8
+ | T_uint16x4
+ | T_uint16x8
+ | T_uint32x2
+ | T_uint32x4
+ | T_uint8x16 -> (List.map string_of_int (uzip_range 1 2 (num_elts / 2)))
+ | _ -> raise Not_found)
+
+ | Trnlo -> (match maskty with
+ T_uint8x8
+ | T_uint16x4
+ | T_uint16x8
+ | T_uint32x2
+ | T_uint32x4
+ | T_uint8x16 -> (List.map string_of_int (trn_range 0 (num_elts) (num_elts / 2)))
+ | _ -> raise Not_found)
+ | Trnhi -> (match maskty with
+ T_uint8x8
+ | T_uint16x4
+ | T_uint16x8
+ | T_uint32x2
+ | T_uint32x4
+ | T_uint8x16 -> (List.map string_of_int (trn_range 1 (num_elts) (num_elts / 2)))
+ | _ -> raise Not_found)
+ | _ -> raise Not_found in
+ "{" ^ String.concat ", " printstr ^ "}"
+
+let perm_locode op = match op with
+ Zip -> Ziplo
+| Unzip -> Uzplo
+| Trn -> Trnlo
+| _ -> raise Not_found
+
+let perm_hicode op = match op with
+ Zip -> Ziphi
+| Unzip -> Uzphi
+| Trn -> Trnhi
+| _ -> raise Not_found
+
(* Return a tuple of a list of declarations to go at the start of the function,
and a list of statements needed to return THING. *)
-let return arity return_by_ptr thing =
+let return arity gcc_builtin_shuffle shufop shufty thing =
match arity with
Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
| Arity4 (ret, _, _, _, _) ->
- match ret with
- T_arrayof (num, vec) ->
- if return_by_ptr then
+ if gcc_builtin_shuffle then
let sname = string_of_vectype ret in
- [Printf.sprintf "%s __rv;" sname],
- [thing ^ ";"; "return __rv;"]
- else
+ let mname = string_of_vectype (masktype ret) in
+ (match shufop with
+ Reverse -> (let mask_initializer = init_rev_mask shufty (masktype ret) in
+ [Printf.sprintf "%s __rv;" sname ; Printf.sprintf "%s __mask1 = %s;" mname mask_initializer],
+ ["__rv = " ^ (cast_for_return ret) ^ thing ^ ";" ; "return __rv;"])
+ | Unzip
+ | Trn
+ | Zip -> (let mask_initializer1 = init_zip_mask (perm_locode shufop) (masktype ret) in
+ let mask_initializer2 = init_zip_mask (perm_hicode shufop) (masktype ret) in
+ [Printf.sprintf "%s __rv;" sname ; Printf.sprintf "%s __mask1 = %s;" mname mask_initializer1 ; Printf.sprintf "%s __mask2 = %s;" mname mask_initializer2; Printf.sprintf "__rv.val[0] = " ^ (cast_for_return (base_type ret)) ^ "__builtin_shuffle (__a, __b, __mask1);" ; Printf.sprintf "__rv.val[1] = " ^ (cast_for_return (base_type ret)) ^ "__builtin_shuffle (__a, __b, __mask2);" ],
+ ["return __rv;"])
+ | _ -> raise Not_found)
+ else
+ match ret with
+ T_arrayof (num, vec) ->
let uname = union_string num vec "__rv" in
[uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"]
| T_void -> [], [thing ^ ";"]
| _ ->
- [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+ [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
let rec element_type ctype =
match ctype with
T_arrayof (_, v) -> element_type v
| _ -> ctype
-let params return_by_ptr ps =
+let params ps =
let pdecls = ref [] in
let ptype t p =
match t with
@@ -182,11 +391,7 @@ let params return_by_ptr ps =
[ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"; ptype t4 "__d"] in
match ps with
Arity0 ret | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
- | Arity4 (ret, _, _, _, _) ->
- if return_by_ptr then
- !pdecls, add_cast (T_ptrto (element_type ret)) "&__rv.val[0]" :: plist
- else
- !pdecls, plist
+ | Arity4 (ret, _, _, _, _) -> !pdecls, plist
let modify_params features plist =
let is_flipped =
@@ -200,9 +405,13 @@ let modify_params features plist =
plist
(* !!! Decide whether to add an extra information word based on the shape
- form. *)
-let extra_word shape features paramlist bits =
+ form. If we have a builtin shuffle we really do not need the extra
+ word. *)
+let extra_word shape features paramlist gcc_builtin_shuffle bits =
let use_word =
+ if gcc_builtin_shuffle then
+ false
+ else
match shape with
All _ | Long | Long_noreg _ | Wide | Wide_noreg _ | Narrow
| By_scalar _ | Wide_scalar | Wide_lane | Binary_imm _ | Long_imm
@@ -239,17 +448,41 @@ let rec mode_suffix elttype shape =
and srcmode = mode_of_elt src shape in
string_of_mode dstmode ^ string_of_mode srcmode
+
+let rec shuffle_inner_op features = match features with
+ [] -> NoShuffle
+ | (GCCBuiltinShuffle (a, b)) :: xs -> a
+ | x :: xs -> shuffle_inner_op xs
+
+let rec shuffle_inner_ty features = match features with
+ [] -> NoElts
+ | (GCCBuiltinShuffle (a, b)) :: xs -> b
+ | x :: xs -> shuffle_inner_ty xs
+
+let shuffle_type features = shuffle_inner_ty features
+let shuffle_op features = shuffle_inner_op features
+
let print_variant opcode features shape name (ctype, asmtype, elttype) =
let bits = infoword_value elttype features in
let modesuf = mode_suffix elttype shape in
- let return_by_ptr = return_by_ptr features in
- let pdecls, paramlist = params return_by_ptr ctype in
+ let gcc_builtin_shuffle = gcc_builtin_shuffle features in
+ let pdecls, paramlist = params ctype in
let paramlist' = modify_params features paramlist in
- let paramlist'' = extra_word shape features paramlist' bits in
+ let paramlist'' = extra_word shape features paramlist' gcc_builtin_shuffle bits in
let parstr = String.concat ", " paramlist'' in
- let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
+ let shufty = shuffle_type features in
+ let shufop = shuffle_op features in
+ let builtin = if gcc_builtin_shuffle then
+ (match shufop with
+ Reverse -> Printf.sprintf " __builtin_shuffle (%s , __mask1)" parstr
+ | Unzip
+ | Trn
+ | Zip -> Printf.sprintf ""
+ | _ -> raise Not_found)
+ else Printf.sprintf "__builtin_neon_%s%s (%s)"
(builtin_name features name) modesuf parstr in
- let rdecls, stmts = return ctype return_by_ptr builtin in
+
+ let rdecls, stmts = return ctype gcc_builtin_shuffle shufop shufty builtin in
let body = pdecls @ rdecls @ stmts
and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
print_function ctype fnname body
@@ -201,6 +201,23 @@ type opcode =
(* Reinterpret casts. *)
| Vreinterp
+(* Shuffletype can be one of the below - The lo and hi variants
+ are to allow the split forms to be generated for the Zip, Unzip
+ Trn cases. These are not to be used from the toplevel ops table
+ but for the lower level routines in neon-gen.ml. *)
+type shuffletype =
+ Reverse
+ | Zip
+ | Ziplo
+ | Ziphi
+ | Unzip
+ | Uzplo
+ | Uzphi
+ | Trn
+ | Trnlo
+ | Trnhi
+ | NoShuffle
+
(* Features used for documentation, to distinguish between some instruction
variants, and to signal special requirements (e.g. swapping arguments). *)
@@ -214,7 +231,7 @@ type features =
| Flipped of string (* Builtin name to use with flipped arguments. *)
| InfoWord (* Pass an extra word for signage/rounding etc. (always passed
for All _, Long, Wide, Narrow shape_forms. *)
- | ReturnPtr (* Pass explicit pointer to return value as first argument. *)
+ | GCCBuiltinShuffle of (shuffletype * elts)
(* A specification as to the shape of instruction expected upon
disassembly, used if it differs from the shape used to build the
intrinsic prototype. Multiple entries in the constructor's argument
@@ -1317,12 +1334,12 @@ let ops =
pf_su_8_64;
(* Reverse elements. *)
- Vrev64, [], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32;
- Vrev64, [], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
- Vrev32, [], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16];
- Vrev32, [], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16];
- Vrev16, [], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8];
- Vrev16, [], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8];
+ Vrev64, [GCCBuiltinShuffle (Reverse, I64)], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32;
+ Vrev64, [GCCBuiltinShuffle (Reverse, I64)], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
+ Vrev32, [GCCBuiltinShuffle (Reverse, I32)], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16];
+ Vrev32, [GCCBuiltinShuffle (Reverse, I32)], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16];
+ Vrev16, [GCCBuiltinShuffle (Reverse, I16)], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8];
+ Vrev16, [GCCBuiltinShuffle (Reverse, I16)], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8];
(* Bit selection. *)
Vbsl,
@@ -1336,25 +1353,15 @@ let ops =
Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
pf_su_8_64;
- (* Transpose elements. **NOTE** ReturnPtr goes some of the way towards
- generating good code for intrinsics which return structure types --
- builtins work well by themselves (and understand that the values being
- stored on e.g. the stack also reside in registers, so can optimise the
- stores away entirely if the results are used immediately), but
- intrinsics are very much less efficient. Maybe something can be improved
- re: inlining, or tweaking the ABI used for intrinsics (a special call
- attribute?).
- *)
- Vtrn, [ReturnPtr], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32;
- Vtrn, [ReturnPtr], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
-
+ Vtrn, [GCCBuiltinShuffle (Trn, NoElts)], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32;
+ Vtrn, [GCCBuiltinShuffle (Trn, NoElts)], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
(* Zip elements. *)
- Vzip, [ReturnPtr], Pair_result Dreg, "vzip", bits_2, pf_su_8_32;
- Vzip, [ReturnPtr], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;
+ Vzip, [GCCBuiltinShuffle (Zip, NoElts)], Pair_result Dreg, "vzip", bits_2, pf_su_8_32;
+ Vzip, [GCCBuiltinShuffle (Zip, NoElts)], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;
(* Unzip elements. *)
- Vuzp, [ReturnPtr], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32;
- Vuzp, [ReturnPtr], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32;
+ Vuzp, [GCCBuiltinShuffle (Unzip, NoElts)], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32;
+ Vuzp, [GCCBuiltinShuffle (Unzip, NoElts)], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32;
(* Element/structure loads. VLD1 variants. *)
Vldx 1,