diff mbox

[2/2,AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*

Message ID CADnVucDj5hBwpvrj+EJGqunDVoT7mfXMgZeqGQ4h_uaVcb1Csg@mail.gmail.com
State New
Headers show

Commit Message

Charles Baylis Oct. 23, 2014, 5:51 p.m. UTC
On 23 October 2014 11:14, Marcus Shawcroft <marcus.shawcroft@gmail.com> wrote:
> On 8 October 2014 18:27,  <charles.baylis@linaro.org> wrote:
>
>> +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
>
> Just above the insertion point in arm-neon.h is the comment:
>
> /* Start of temporary inline asm for vldn, vstn and friends.  */
>
> This patch removes the "temporary inline asm vldn" implementation, the
> replacement implementation should be inserted in the section below,
> look for the comment that starts:

OK. The vstN_lane intrinsics are similarly misplaced, I'll do a
separate patch to move them.

> "Start of optimal implementations...."
>
>> +                        mode, ptrmode, funcsuffix, signedtype)            \
>> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
>> +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
>> +{                                                                         \
>> +  __builtin_aarch64_simd_oi __o;                                          \
>> +  largetype __temp;                                                       \
>> +  __temp.val[0] =                                                         \
>
> There is something odd about the white space here, space before tab?
> This is repeated in various places through the rest of the patch.

There are a few spaces before tabs, but the weird misalignment of the
\'s when reading the patch is just due to the effect of the unified
diff misaligning the code with the tabstops.

I have respun the patch with those spaces removed.

> Otherwise this and the previous 1/2 associated patch look good, can
> you respin with these tidy ups?

OK for trunk?

Comments

Marcus Shawcroft Oct. 24, 2014, 10:23 a.m. UTC | #1
On 23 October 2014 18:51, Charles Baylis <charles.baylis@linaro.org> wrote:

>> Otherwise this and the previous 1/2 associated patch look good, can
>> you respin with these tidy ups?
>
> OK for trunk?

OK
/Marcus
Charles Baylis Oct. 24, 2014, 3:12 p.m. UTC | #2
On 24 October 2014 11:23, Marcus Shawcroft <marcus.shawcroft@gmail.com> wrote:
> On 23 October 2014 18:51, Charles Baylis <charles.baylis@linaro.org> wrote:
>
>>> Otherwise this and the previous 1/2 associated patch look good, can
>>> you respin with these tidy ups?
>>
>> OK for trunk?
>
> OK
> /Marcus

Committed to trunk as r216671 and r216672.
diff mbox

Patch

From a37d24c57f6c7abe4ade05c1f383e82ebd20c052 Mon Sep 17 00:00:00 2001
From: Charles Baylis <charles.baylis@linaro.org>
Date: Wed, 10 Sep 2014 13:45:25 +0100
Subject: [PATCH 2/2] [AARCH64,NEON] Convert arm_neon.h to use new builtins for
 vld[234](q?)_lane_*

This patch replaces the inline assembler implementations of the
vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
functions added in patch 1.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.

<DATE>  Charles Baylis  <charles.baylis@linaro.org>

	* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
	update uses to use new macro arguments.
	(__LD3_LANE_FUNC): Likewise.
	(__LD4_LANE_FUNC): Likewise.
---
 gcc/config/aarch64/arm_neon.h | 404 +++++++++++++++++++++++++++++-------------
 1 file changed, 281 insertions(+), 123 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 9b1873f..18c6e92 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11805,47 +11805,6 @@  __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
 __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
 __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
 
-#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"	\
-	     "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17");					\
-    return result;							\
-  }
-
-__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
-__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
 
 #define __LD3R_FUNC(rettype, structtype, ptrtype,			\
 		    regsuffix, funcsuffix, Q)				\
@@ -11887,48 +11846,6 @@  __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
 __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
 __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
 
-#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"	\
-	     "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17", "v18");				\
-    return result;							\
-  }
-
-__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
-__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
-
 #define __LD4R_FUNC(rettype, structtype, ptrtype,			\
 		    regsuffix, funcsuffix, Q)				\
   __extension__ static __inline rettype					\
@@ -11969,47 +11886,7 @@  __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
 __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
 __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
 
-#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix,			\
-			lnsuffix, funcsuffix, Q)			\
-  __extension__ static __inline rettype					\
-  __attribute__ ((__always_inline__))					\
-  vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
-				     rettype b, const int c)		\
-  {									\
-    rettype result;							\
-    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"	\
-	     "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t"	\
-	     "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t"	\
-	     : "=Q"(result)						\
-	     : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)		\
-	     : "memory", "v16", "v17", "v18", "v19");			\
-    return result;							\
-  }
 
-__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
-__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
 
 #define __ST2_LANE_FUNC(intype, largetype, ptrtype,			     \
 			mode, ptr_mode, funcsuffix, signedtype)		     \
@@ -17583,6 +17460,287 @@  vld4q_f64 (const float64_t * __a)
   return ret;
 }
 
+/* vld2_lane */
+
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_oi __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] =							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregoi##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregoi##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o =	__builtin_aarch64_ld2_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1);	   \
+  return __b;								   \
+}
+
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD2_LANE_FUNC
+
+/* vld2q_lane */
+
+#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  __builtin_aarch64_simd_oi __o;					   \
+  intype ret;								   \
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_ld2_lane##mode (				   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);		   \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0);	   \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1);	   \
+  return ret;								   \
+}
+
+__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
+
+#undef __LD2_LANE_FUNC
+
+/* vld3_lane */
+
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_ci __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] =							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __temp.val[2] =							   \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o = __builtin_aarch64_set_qregci##mode (__o,			   \
+					   (signedtype) __temp.val[2],	   \
+					   2);				   \
+  __o =	__builtin_aarch64_ld3_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1);	   \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2);	   \
+  return __b;								   \
+}
+
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD3_LANE_FUNC
+
+/* vld3q_lane */
+
+#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  __builtin_aarch64_simd_ci __o;					   \
+  intype ret;								   \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_ld3_lane##mode (				   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);		   \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0);	   \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1);	   \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2);	   \
+  return ret;								   \
+}
+
+__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
+
+#undef __LD3_LANE_FUNC
+
+/* vld4_lane */
+
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype,		   \
+			 mode, ptrmode, funcsuffix, signedtype)		   \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{									   \
+  __builtin_aarch64_simd_xi __o;					   \
+  largetype __temp;							   \
+  __temp.val[0] =							   \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));	   \
+  __temp.val[1] =							   \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));	   \
+  __temp.val[2] =							   \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));	   \
+  __temp.val[3] =							   \
+    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0));	   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[0],	   \
+					   0);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[1],	   \
+					   1);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[2],	   \
+					   2);				   \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,			   \
+					   (signedtype) __temp.val[3],	   \
+					   3);				   \
+  __o =	__builtin_aarch64_ld4_lane##mode (				   \
+	  (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);	   \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0);	   \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1);	   \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2);	   \
+  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3);	   \
+  return __b;								   \
+}
+
+/* vld4q_lane */
+
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
+		 sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
+		 df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
+		 int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
+		 p16, int16x8_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
+		 int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
+		 int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
+		 int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
+		 int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
+		 int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
+		 u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
+		 u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
+		 u64, int64x2_t)
+
+#undef __LD4_LANE_FUNC
+
+/* vld4q_lane */
+
+#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{									   \
+  __builtin_aarch64_simd_xi __o;					   \
+  intype ret;								   \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
+  __o = __builtin_aarch64_ld4_lane##mode (				   \
+	(__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);		   \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0);	   \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1);	   \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2);	   \
+  ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3);	   \
+  return ret;								   \
+}
+
+__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
+
+#undef __LD4_LANE_FUNC
+
 /* vmax */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-- 
1.9.1