[2/4,AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*

Message ID: 1411069109-31425-3-git-send-email-charles.baylis@linaro.org
State: New

Commit Message

Charles Baylis Sept. 18, 2014, 7:38 p.m. UTC
This patch replaces the inline assembler implementations of the
vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
functions added in patch 1.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
under qemu; it also causes no regressions in clyon's NEON intrinsics tests.
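
For illustration, here is one of the affected intrinsics as a caller sees
it (a minimal sketch; the lane index must be a constant expression):

#include <arm_neon.h>

/* Reload lane 1 of a pair of interleaved int32x2_t vectors from p.
   With this patch the intrinsic expands to the new ld2_lane builtin
   instead of the old inline-assembler sequence.  */
int32x2x2_t
load_lane1 (const int32_t *p, int32x2x2_t acc)
{
  return vld2_lane_s32 (p, acc, 1);
}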

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
	update uses to use new macro arguments.
	(__LD3_LANE_FUNC): Likewise.
	(__LD4_LANE_FUNC): Likewise.

Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
---
 gcc/config/aarch64/arm_neon.h | 359 ++++++++++++++++++++++++++++--------------
 1 file changed, 237 insertions(+), 122 deletions(-)

Comments

Tejas Belagod Sept. 19, 2014, 11:21 a.m. UTC | #1
On 18/09/14 20:38, Charles Baylis wrote:
> This patch replaces the inline assembler implementations of the
> vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
> functions added in patch 1.
>
> Tested (with the rest of the patch series) with make check on aarch64-oe-linux
> with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
>
> <DATE>  Charles Baylis  <charles.baylis@linaro.org>
>
>          * config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
>          update uses to use new macro arguments.
>          (__LD3_LANE_FUNC): Likewise.
>          (__LD4_LANE_FUNC): Likewise.
>
> Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
> ---
>   gcc/config/aarch64/arm_neon.h | 359 ++++++++++++++++++++++++++++--------------
>   1 file changed, 237 insertions(+), 122 deletions(-)
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index e62c783..c1fcb47 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -11805,47 +11805,79 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
>   __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
>   __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
>
> -#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"    \
> -            "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t"   \
> -            "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"     \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17");                                 \
> -    return result;                                                     \
> -  }
> -
> -__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
> -__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
> -__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
> -__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
> -__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
> -__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
> -__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
> -__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
> -__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
> -__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
> -__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
> -__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
> -__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
> -__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
> -__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
> -__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
> -__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
> -__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
> -__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
> -__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
> -__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
> -__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
> -__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
> -__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
> +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_oi __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregoi##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregoi##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o =        __builtin_aarch64_ld2_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1);         \
> +  return __b;                                                             \
> +}
> +
> +__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD2_LANE_FUNC
> +#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_oi __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld2_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +
> +__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
> +__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
> +__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
> +__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
> +__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
> +__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
> +__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
> +__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
> +__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
> +__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
> +__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
> +__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
>
>   #define __LD3R_FUNC(rettype, structtype, ptrtype,                      \
>                      regsuffix, funcsuffix, Q)                           \
> @@ -11887,47 +11919,85 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
>   __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
>   __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
>
> -#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"   \
> -            "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t"  \
> -            "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t"    \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17", "v18");                          \
> -    return result;                                                     \
> -  }
> -
> -__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
> -__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
> -__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
> -__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
> -__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
> -__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
> -__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
> -__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
> -__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
> -__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
> -__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
> -__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
> -__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
> -__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
> -__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
> -__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
> -__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
> -__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
> -__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
> -__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
> -__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
> -__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
> -__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
> -__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
> +#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_ci __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __temp.val[2] =                                                         \
> +    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[2],     \
> +                                          2);                             \
> +  __o =        __builtin_aarch64_ld3_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1);         \
> +  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2);         \
> +  return __b;                                                             \
> +}
> +
> +__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD3_LANE_FUNC
> +#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_ci __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld3_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +
> +__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
> +__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
> +__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
> +__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
> +__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
> +__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
> +__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
> +__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
> +__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
> +__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
> +__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
> +__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
>
>   #define __LD4R_FUNC(rettype, structtype, ptrtype,                      \
>                      regsuffix, funcsuffix, Q)                           \
> @@ -11969,47 +12039,92 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
>   __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
>   __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
>
> -#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"   \
> -            "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t"  \
> -            "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t"    \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17", "v18", "v19");                   \
> -    return result;                                                     \
> -  }
>
> -__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
> -__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
> -__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
> -__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
> -__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
> -__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
> -__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
> -__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
> -__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
> -__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
> -__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
> -__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
> -__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
> -__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
> -__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
> -__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
> -__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
> -__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
> -__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
> -__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
> -__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
> -__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
> -__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
> -__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
> +#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_xi __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __temp.val[2] =                                                         \
> +    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));         \
> +  __temp.val[3] =                                                         \
> +    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[2],     \
> +                                          2);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[3],     \
> +                                          3);                             \
> +  __o =        __builtin_aarch64_ld4_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1);         \
> +  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2);         \
> +  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3);         \
> +  return __b;                                                             \
> +}
> +
> +__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD4_LANE_FUNC
> +#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_xi __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld4_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +

The reason we avoided type-punning using unions was that reload would 
get confused by potential subreg(mem) that could be introduced because 
of the memory transfers caused by unions and large int modes. As a 
result, we would get incorrect or sub-optimal code. But this seems to 
have fixed itself. :-)
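
For reference, here is the q-variant type-punning pattern under 
discussion, expanded by hand for the s32 case (a sketch only: it assumes 
the new ld2_lane builtins from patch 1 and simply mirrors the 
vld2q_lane macro above):

__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
xvld2q_lane_s32 (const int32_t *__ptr, int32x4x2_t __b, const int __c)
{
  /* Reinterpret the int32x4x2_t argument as the OI-mode builtin type
     in place; the builtin then updates both Q registers at once.  */
  union { int32x4x2_t __i;
          __builtin_aarch64_simd_oi __o; } __temp = { __b };
  __temp.__o = __builtin_aarch64_ld2_lanev4si (
        (__builtin_aarch64_simd_si *) __ptr, __temp.__o, __c);
  return __temp.__i;
}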

Because this involves transfers between large int modes, and 
CANNOT_CHANGE_MODE_CLASS has some impact on them, it would be good to 
test what impact your patch has with C_C_M_C removed, so that it will be 
easier to fix the fallout once we eventually remove C_C_M_C. To test 
this you will need Richard's patch set:
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html

The same applies to your other two patches in this series (3 and 4).

Thanks,
Tejas.

> +__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
> +__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
> +__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
> +__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
> +__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
> +__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
> +__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
> +__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
> +__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
> +__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
> +__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
> +__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
>
>   #define __ST2_LANE_FUNC(intype, largetype, ptrtype,                         \
>                          mode, ptr_mode, funcsuffix, signedtype)              \
> --
> 1.9.1
>
>
Charles Baylis Sept. 26, 2014, 1:16 a.m. UTC | #2
On 19 September 2014 12:21, Tejas Belagod <tejas.belagod@arm.com> wrote:
> The reason we avoided using type-punning using unions was that reload would
> get confused with potential subreg(mem) that could be introduced because of
> memory xfer caused by unions and large int modes. As a result, we would get
> incorrect or sub-optimal code. But this seems to have fixed itself. :-)
>
> Because this involves xfers between large int modes and
> CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test
> what impact your patch has with C_C_M_C removed, so that it will be easier
> to fix the fallout once we remove C_C_M_C eventually. To test this you will
> need Richard's patch set
> https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.
>
> Same for your other 2 patches in this series(3,4).

I tried those patches, and altered aarch64_cannot_change_mode_class to
return false for all cases.

However, this does not avoid the unnecessary moves.
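
For reference, a sketch of that change, assuming the 2014-era hook in
gcc/config/aarch64/aarch64.c (details may differ):

/* Experiment only: accept every mode change, i.e. behave as if
   CANNOT_CHANGE_MODE_CLASS were removed.  */
bool
aarch64_cannot_change_mode_class (enum machine_mode from,
                                  enum machine_mode to,
                                  enum reg_class rclass)
{
  return false;
}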

Taking a really simple test case:

#include <arm_neon.h>

int32x2x2_t xvld2_s32(int32_t *__a)
{
  int32x2x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
  return ret;
}

(disabling scheduling for clarity)
$ aarch64-oe-linux-gcc -O2 -S -o - simd.c -fno-schedule-insns -fno-schedule-insns2
        ...
xvld2_s32:
        ld2     {v2.2s - v3.2s}, [x0]
        orr     v0.8b, v2.8b, v2.8b
        orr     v1.8b, v3.8b, v3.8b
        ret
        ...


The reason is apparent in the rtl dump from ira:
...
      Allocno a0r73 of FP_REGS(32) has 31 avail. regs  33-63, node:
33-63 (confl regs =  0-32 64 65)
...
(insn 2 4 3 2 (set (reg/v/f:DI 79 [ __a ])
        (reg:DI 0 x0 [ __a ])) simd.c:5 34 {*movdi_aarch64}
     (expr_list:REG_DEAD (reg:DI 0 x0 [ __a ])
        (nil)))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 20 2 (set (reg/v:OI 73 [ __o ])
        (subreg:OI (vec_concat:V8SI (vec_concat:V4SI (unspec:V2SI [
                            (mem:TI (reg/v/f:DI 79 [ __a ]) [0  S16 A8])
                        ] UNSPEC_LD2)
                    (vec_duplicate:V2SI (const_int 0 [0])))
                (vec_concat:V4SI (unspec:V2SI [
                            (mem:TI (reg/v/f:DI 79 [ __a ]) [0  S16 A8])
                        ] UNSPEC_LD2)
                    (vec_duplicate:V2SI (const_int 0 [0])))) 0))
simd.c:8 2149 {aarch64_ld2v2si_dreg}
     (expr_list:REG_DEAD (reg/v/f:DI 79 [ __a ])
        (nil)))
(insn 20 6 21 2 (set (reg:V2SI 32 v0)
        (subreg:V2SI (reg/v:OI 73 [ __o ]) 0)) simd.c:12 778
{*aarch64_simd_movv2si}
     (nil))
(insn 21 20 22 2 (set (reg:V2SI 33 v1)
        (subreg:V2SI (reg/v:OI 73 [ __o ]) 16)) simd.c:12 778
{*aarch64_simd_movv2si}
     (expr_list:REG_DEAD (reg/v:OI 73 [ __o ])
        (nil)))
(insn 22 21 23 2 (use (reg:V2SI 32 v0)) simd.c:12 -1
     (nil))
(insn 23 22 0 2 (use (reg:V2SI 33 v1)) simd.c:12 -1
     (nil))

The register allocator considers r73 to conflict with v0, because they
are simultaneously live after insn 20. Without the second use of r73
(e.g. if the write to ret.val[1] is replaced with vdup_n_s32 (0)), the
allocator does do the right thing with the subreg and allocates r73 to
{v0,v1}.
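
For reference, a sketch of that single-use variant, built from the same
builtins as the test case above:

int32x2x2_t xvld2_s32_single (int32_t *__a)
{
  int32x2x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  /* Only one dreg is extracted from __o; the second result comes from
     a vdup, so __o no longer conflicts with v0.  */
  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
  ret.val[1] = vdup_n_s32 (0);
  return ret;
}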

I haven't read all of the old threads relating to Richard's patches
yet, but I don't see why they would affect this issue.

I don't think the register allocator is able to resolve this unless
the conversion between the __builtin_aarch64_simd_oi type and the
int32x2x2_t type is done as a single operation.

However, type-punning is not possible with the arrays of 64-bit
vectors, as those arrays are not the same size as the corresponding
__builtin_aarch64_simd types, and any solution for them would probably
help with the q variants too. Maybe the solution is to pass the NEON
intrinsic types directly to the builtins? Is there a reason that it
wasn't done that way before?

Thanks
Charles
Tejas Belagod Sept. 26, 2014, 12:47 p.m. UTC | #3
On 26/09/14 02:16, Charles Baylis wrote:
> On 19 September 2014 12:21, Tejas Belagod <tejas.belagod@arm.com> wrote:
>> The reason we avoided using type-punning using unions was that reload would
>> get confused with potential subreg(mem) that could be introduced because of
>> memory xfer caused by unions and large int modes. As a result, we would get
>> incorrect or sub-optimal code. But this seems to have fixed itself. :-)
>>
>> Because this involves xfers between large int modes and
>> CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test
>> what impact your patch has with C_C_M_C removed, so that it will be easier
>> to fix the fallout once we remove C_C_M_C eventually. To test this you will
>> need Richard's patch set
>> https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.
>>
>> Same for your other 2 patches in this series(3,4).
>
> I tried those patches, and altered aarch64_cannot_change_mode_class to
> return false for all cases.
>
> However, this does not avoid the unnecessary moves.
>
> Taking a really simple test case:
>
> #include <arm_neon.h>
>
> int32x2x2_t xvld2_s32(int32_t *__a)
> {
>    int32x2x2_t ret;
>    __builtin_aarch64_simd_oi __o;
>    __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
>    ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
>    ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
>    return ret;
> }
>
> (disabling scheduling for clarity)
> $ aarch64-oe-linux-gcc -O2 -S -o - simd.c -fno-schedule-insns
> -fno-schedule-insns2
>          ...
> xvld2_s32:
>          ld2     {v2.2s - v3.2s}, [x0]
>          orr     v0.8b, v2.8b, v2.8b
>          orr     v1.8b, v3.8b, v3.8b
>          ret
>          ...
>
>
> The reason is apparent in the rtl dump from ira:
> ...
>        Allocno a0r73 of FP_REGS(32) has 31 avail. regs  33-63, node:
> 33-63 (confl regs =  0-32 64 65)
> ...
> (insn 2 4 3 2 (set (reg/v/f:DI 79 [ __a ])
>          (reg:DI 0 x0 [ __a ])) simd.c:5 34 {*movdi_aarch64}
>       (expr_list:REG_DEAD (reg:DI 0 x0 [ __a ])
>          (nil)))
> (note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
> (insn 6 3 20 2 (set (reg/v:OI 73 [ __o ])
>          (subreg:OI (vec_concat:V8SI (vec_concat:V4SI (unspec:V2SI [
>                              (mem:TI (reg/v/f:DI 79 [ __a ]) [0  S16 A8])
>                          ] UNSPEC_LD2)
>                      (vec_duplicate:V2SI (const_int 0 [0])))
>                  (vec_concat:V4SI (unspec:V2SI [
>                              (mem:TI (reg/v/f:DI 79 [ __a ]) [0  S16 A8])
>                          ] UNSPEC_LD2)
>                      (vec_duplicate:V2SI (const_int 0 [0])))) 0))
> simd.c:8 2149 {aarch64_ld2v2si_dreg}
>       (expr_list:REG_DEAD (reg/v/f:DI 79 [ __a ])
>          (nil)))
> (insn 20 6 21 2 (set (reg:V2SI 32 v0)
>          (subreg:V2SI (reg/v:OI 73 [ __o ]) 0)) simd.c:12 778
> {*aarch64_simd_movv2si}
>       (nil))
> (insn 21 20 22 2 (set (reg:V2SI 33 v1)
>          (subreg:V2SI (reg/v:OI 73 [ __o ]) 16)) simd.c:12 778
> {*aarch64_simd_movv2si}
>       (expr_list:REG_DEAD (reg/v:OI 73 [ __o ])
>          (nil)))
> (insn 22 21 23 2 (use (reg:V2SI 32 v0)) simd.c:12 -1
>       (nil))
> (insn 23 22 0 2 (use (reg:V2SI 33 v1)) simd.c:12 -1
>       (nil))
>
> The register allocator considers r73 to conflict with v0, because they
> are simultaneously live after insn 20. Without the second use of r73
> (e.g. if the write to ret.val[1] is replaced with vdup_n_s32 (0)), the
> allocator does do the right thing with the subreg and allocates r73 to
> {v0,v1}.
>
> I haven't read all of the old threads relating to Richard's patches
> yet, but I don't see why they would affect this issue.
>
> I don't think the register allocator is able to resolve this unless
> the conversion between the __builtin_aarch64_simd_oi type and the
> int32x2x2_t type is done as a single operation.
>

For this piece of code,

#include "arm_neon.h"

int32x2x2_t xvld2_s32(int32_t *__a)
{
   union { int32x2x2_t __i;
          __builtin_aarch64_simd_oi __o; } __temp;
   __temp.__o =
     __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
   return __temp.__i;
}

int32x2x2_t yvld2_s32(int32_t *__a)
{
   int32x2x2_t ret;
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
   ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
   ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
   return ret;
}

currently my gcc HEAD generates at -O3:

xvld2_s32:
	ld2	{v0.2s - v1.2s}, [x0]
	sub	sp, sp, #64
	st1	{v0.16b - v1.16b}, [sp]
	ldr	x1, [sp]
	ldr	x0, [sp, 8]
	add	sp, sp, 64
	ins	v0.d[0], x1
	ins	v1.d[0], x0
	ret
         ....
yvld2_s32:
	ld2	{v2.2s - v3.2s}, [x0]
	orr	v1.8b, v3.8b, v3.8b
	orr	v0.8b, v2.8b, v2.8b
	ret

If we use type-punning, there are unnecessary spills generated, which 
are also incorrect for BE because of the way we spill (st1 
{v0.16b - v1.16b}, [sp]) and restore. The implementation without 
type-punning seems to give a more optimal result. Did your patches 
improve on the spills for the type-punning solution?

> However, type-punning is not possible with the arrays of 64 bit
> vectors, as the arrays are not the same size as the corresponding
> __builtin_simd types, and any solution for those would probably help
> with the q variants too.

That is because we fill a zero-extended D-reg value into a 128-bit reg 
and pack those into a large int mode (e.g. OI). We don't have large int 
modes made up purely of D-regs because we run into ambiguities: 4 D-regs 
(4 x 64 = 256 bits) would be an OImode, but 2 Q-regs (2 x 128 = 256 bits) 
are also an OImode.

> Maybe the solution is to pass the NEON
> intrinsic types directly to the builtins? Is there a reason that it
> wasn't done that way before?
>

How do you mean? Do you mean pass a loaded int32x2x2_t value into a 
__builtin? How will that work?

If you mean why we don't pass an int32x2x2_t into a builtin as a 
structure, I don't think that would work, as it is a struct type which 
would correspond to a BLK mode, but we need RTL patterns with reg-lists 
to work with large int modes for the regalloc to allocate consecutive 
regs for the reglists.

Thanks,
Tejas.
Charles Baylis Oct. 8, 2014, 6:47 p.m. UTC | #4
On 26 September 2014 13:47, Tejas Belagod <tejas.belagod@arm.com> wrote:
> If we use type-punning, there are unnecessary spills that are generated
> which is also incorrect for BE because of the way we spill (st1 {v0.16b -
> v1.16b}, [sp]) and restore. The implementation without type-punning seems to
> give a more optimal result. Did your patches improve on the spills for the
> type-punning solution?

OK, this part seems too contentious, so I've respun the vldN_lane
parts without the type punning and reposted them. This issue can be
resolved separately.

Trying an example like this gives good code with type punning, and
poor code without.

void t2(int32_t *p)
{
    int32x4x4_t va = vld4q_s32(p);
    va = vld4q_lane_s32(p + 500, va, 1);
    vst4q_s32(p+1000, va);
}


With type-punning, good code:
t2:
        ld4     {v0.4s - v3.4s}, [x0]
        add     x2, x0, 2000
        add     x1, x0, 4000
        ld4     {v0.s - v3.s}[1], [x2]
        st4     {v0.4s - v3.4s}, [x1]
        ret

Without type-punning, horrible code:
t2:
        ld4     {v0.4s - v3.4s}, [x0]
        sub     sp, sp, #64
        add     x14, x0, 2000
        add     x0, x0, 4000
        umov    x12, v0.d[0]
        umov    x13, v0.d[1]
        umov    x10, v1.d[0]
        umov    x11, v1.d[1]
        umov    x8, v2.d[0]
        str     x12, [sp]
        umov    x9, v2.d[1]
        str     x13, [sp, 8]
        str     q3, [sp, 48]
        str     x10, [sp, 16]
        str     x11, [sp, 24]
        str     x8, [sp, 32]
        str     x9, [sp, 40]
        ld1     {v0.16b - v3.16b}, [sp]
        ld4     {v0.s - v3.s}[1], [x14]
        umov    x10, v0.d[0]
        umov    x11, v0.d[1]
        umov    x8, v1.d[0]
        umov    x9, v1.d[1]
        umov    x6, v2.d[0]
        str     x10, [sp]
        umov    x7, v2.d[1]
        str     x11, [sp, 8]
        str     q3, [sp, 48]
        str     x8, [sp, 16]
        str     x9, [sp, 24]
        str     x6, [sp, 32]
        str     x7, [sp, 40]
        ld1     {v0.16b - v3.16b}, [sp]
        add     sp, sp, 64
        st4     {v0.4s - v3.4s}, [x0]
        ret

>> Maybe the solution is to pass the NEON
>> intrinsic types directly to the builtins? Is there a reason that it
>> wasn't done that way before?
>
> How do you mean? Do you mean pass a loaded value int32x2x2_t into a
> __builtin? How will that work?
>
> If you mean why we don't pass an int32x2x2_t into a builtin as a structure,
> I don't think that would work as it is a struct type which would correspond to
> a BLK mode, but we need RTL patterns with reg-lists to work with large int
> modes for the regalloc to allocate consecutive regs for the reglists.

OK, that makes sense. However, something needs to be done to create
the __builtin_aarch64_simd_* objects without register moves. Since the
existing mechanism causes problems because the lifetimes of the inputs
overlap with the lifetimes of the outputs, I think there are these
options:

1. represent the construction/deconstruction as a single operation, to
avoid overlapping variable liveness in the source.
2. add a pass or peephole which can combine the existing builtins into
a single operation, so that the lifetimes are normalised.
3. teach the register allocator how to handle overlapping liveness of
a register and a subreg of that register.

Option 1 would require a new builtin interface which somehow handled a
whole int32x2x2_t in one operation. Construction is easy
(__builtin_aarch64_simd_construct (v.val[0], v.val[1]) or similar);
deconstruction is less obvious.

Option 2 sounds like a hack, but would probably be effective,
particularly if it can be done before inlining.

Option 3 would also help with poor code generation for ARM targets
with vget_low_*, vget_high_* and vcombine_*.

What do you think is the best approach?

Thanks
Charles

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index e62c783..c1fcb47 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11805,47 +11805,79 @@  __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
 __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
 __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
 
-#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"	\
-	     "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17");					\
-    return result;							\
-  }
-
-__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
-__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_oi __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] = 							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregoi##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregoi##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o =	__builtin_aarch64_ld2_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1);	   \
+  return __b;								   \
+}
+
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD2_LANE_FUNC
+#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)	   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  union { intype __i;							   \
+	  __builtin_aarch64_simd_oi __o; } __temp = { __b };		   \
+  __temp.__o = __builtin_aarch64_ld2_lane##mode (			   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);	   \
+  return __temp.__i;							   \
+}
+
+__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
 
 #define __LD3R_FUNC(rettype, structtype, ptrtype,			\
 		    regsuffix, funcsuffix, Q)				\
@@ -11887,47 +11919,85 @@  __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
 __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
 __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
 
-#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"	\
-	     "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17", "v18");				\
-    return result;							\
-  }
-
-__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
-__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_ci __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] = 							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __temp.val[2] =							   \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[2],	   \
+					   2);				   \
+  __o =	__builtin_aarch64_ld3_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1);	   \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2);	   \
+  return __b;								   \
+}
+
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD3_LANE_FUNC
+#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)	   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  union { intype __i;							   \
+	  __builtin_aarch64_simd_ci __o; } __temp = { __b };		   \
+  __temp.__o = __builtin_aarch64_ld3_lane##mode (			   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);	   \
+  return __temp.__i;							   \
+}
+
+__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
 
 #define __LD4R_FUNC(rettype, structtype, ptrtype,			\
 		    regsuffix, funcsuffix, Q)				\
@@ -11969,47 +12039,92 @@  __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
 __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
 __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
 
-#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"	\
-	     "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17", "v18", "v19");			\
-    return result;							\
-  }
 
-__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
-__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_xi __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] = 							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __temp.val[2] =							   \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));	   \
+  __temp.val[3] =							   \
+    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[2],	   \
+					   2);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[3],	   \
+					   3);				   \
+  __o =	__builtin_aarch64_ld4_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1);	   \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2);	   \
+  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3);	   \
+  return __b;								   \
+}
+
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD4_LANE_FUNC
+#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)	   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  union { intype __i;							   \
+	  __builtin_aarch64_simd_xi __o; } __temp = { __b };		   \
+  __temp.__o = __builtin_aarch64_ld4_lane##mode (			   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);	   \
+  return __temp.__i;							   \
+}
+
+__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
 
 #define __ST2_LANE_FUNC(intype, largetype, ptrtype,			     \
 			mode, ptr_mode, funcsuffix, signedtype)		     \