diff mbox

[2/3,AArch64] Extend aarch64_simd_vec_set pattern, replace asm for vld1_lane

Message ID 5465DD6F.2060000@arm.com
State New
Headers show

Commit Message

Alan Lawrence Nov. 14, 2014, 10:46 a.m. UTC
The vld1_lane intrinsic is currently implemented using inline asm. This patch 
replaces that with a load and a straightforward use of vset_lane (this gives us 
correct bigendian lane-flipping in a simple manner).

Naively this would produce assembler along the lines of (for vld1_lane_u8):
         ldrb    w0, [x0]
         ins     v0.b[5], w0
Hence, the patch also extends the aarch64_simd_vec_set pattern, adding a variant 
that reads from a memory operand, producing the expected:
         ld1     {v0.b}[5], [x0]
...and thus we'll also get that assembler from a programmer writing natively in 
GCC vector extensions and not using intrinsics :).

I've also added a testcase, as existing tests in aarch64 and advsimd-intrinsics 
seemed only to cover vld{2,3,4}_lane, not vld1_lane.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (aarch64_simd_vec_set<mode>): Add
	variant reading from memory and assembling to ld1.

	* config/aarch64/arm_neon.h (vld1_lane_f32, vld1_lane_f64, vld1_lane_p8,
	vld1_lane_p16, vld1_lane_s8, vld1_lane_s16, vld1_lane_s32,
	vld1_lane_s64, vld1_lane_u8, vld1_lane_u16, vld1_lane_u32,
	vld1_lane_u64, vld1q_lane_f32, vld1q_lane_f64, vld1q_lane_p8,
	vld1q_lane_p16, vld1q_lane_s8, vld1q_lane_s16, vld1q_lane_s32,
	vld1q_lane_s64, vld1q_lane_u8, vld1q_lane_u16, vld1q_lane_u32,
	vld1q_lane_u64): Replace asm with vset_lane and pointer dereference.

gcc/testsuite/ChangeLog:

	gcc.target/aarch64/vld1_lane.c: New test.

Comments

Marcus Shawcroft Nov. 17, 2014, 5:27 p.m. UTC | #1
On 14 November 2014 10:46, Alan Lawrence <alan.lawrence@arm.com> wrote:

> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md (aarch64_simd_vec_set<mode>): Add
>         variant reading from memory and assembling to ld1.
>
>         * config/aarch64/arm_neon.h (vld1_lane_f32, vld1_lane_f64,
> vld1_lane_p8,
>         vld1_lane_p16, vld1_lane_s8, vld1_lane_s16, vld1_lane_s32,
>         vld1_lane_s64, vld1_lane_u8, vld1_lane_u16, vld1_lane_u32,
>         vld1_lane_u64, vld1q_lane_f32, vld1q_lane_f64, vld1q_lane_p8,
>         vld1q_lane_p16, vld1q_lane_s8, vld1q_lane_s16, vld1q_lane_s32,
>         vld1q_lane_s64, vld1q_lane_u8, vld1q_lane_u16, vld1q_lane_u32,
>         vld1q_lane_u64): Replace asm with vset_lane and pointer dereference.
>
> gcc/testsuite/ChangeLog:
>
>         gcc.target/aarch64/vld1_lane.c: New test.

OK /Marcus
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 00b59d3a352325e77632daa9723f3df4850cf922..b77a4f831c44df9df8fac609216ee3c501e0e54a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -455,12 +455,12 @@ 
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VQ_S 0 "register_operand" "=w,w")
+  [(set (match_operand:VQ_S 0 "register_operand" "=w,w,w")
         (vec_merge:VQ_S
 	    (vec_duplicate:VQ_S
-		(match_operand:<VEL> 1 "register_operand" "r,w"))
-	    (match_operand:VQ_S 3 "register_operand" "0,0")
-	    (match_operand:SI 2 "immediate_operand" "i,i")))]
+		(match_operand:<VEL> 1 "aarch64_simd_general_operand" "r,w,Utv"))
+	    (match_operand:VQ_S 3 "register_operand" "0,0,0")
+	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
    int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
@@ -471,11 +471,13 @@ 
 	return "ins\\t%0.<Vetype>[%p2], %w1";
      case 1:
 	return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
+     case 2:
+        return "ld1\\t{%0.<Vetype>}[%p2], %1";
      default:
 	gcc_unreachable ();
      }
   }
-  [(set_attr "type" "neon_from_gp<q>, neon_ins<q>")]
+  [(set_attr "type" "neon_from_gp<q>, neon_ins<q>, neon_load1_1reg<q>")]
 )
 
 (define_insn "aarch64_simd_lshr<mode>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 4a0d718642f8a3cb56281a70435b1b6445ee35be..f036f7c0ba2733a822661027b815e7c3654db1bc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -628,7 +628,7 @@  typedef struct poly16x8x4_t
 #define __aarch64_vdupq_laneq_u64(__a, __b) \
    __aarch64_vdup_lane_any (u64, q, q, __a, __b)
 
-/* vset_lane internal macro.  */
+/* vset_lane and vld1_lane internal macro.  */
 
 #ifdef __AARCH64EB__
 /* For big-endian, GCC's vector indices are the opposite way around
@@ -6275,162 +6275,6 @@  vld1_dup_u64 (const uint64_t * a)
   return result;
 }
 
-#define vld1_lane_f32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t b_ = (b);                                            \
-       const float32_t * a_ = (a);                                      \
-       float32x2_t result;                                              \
-       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_f64(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x1_t b_ = (b);                                            \
-       const float64_t * a_ = (a);                                      \
-       float64x1_t result;                                              \
-       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_p8(a, b, c)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       poly8x8_t b_ = (b);                                              \
-       const poly8_t * a_ = (a);                                        \
-       poly8x8_t result;                                                \
-       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_p16(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       poly16x4_t b_ = (b);                                             \
-       const poly16_t * a_ = (a);                                       \
-       poly16x4_t result;                                               \
-       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_s8(a, b, c)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int8x8_t b_ = (b);                                               \
-       const int8_t * a_ = (a);                                         \
-       int8x8_t result;                                                 \
-       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_s16(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       const int16_t * a_ = (a);                                        \
-       int16x4_t result;                                                \
-       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_s32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       const int32_t * a_ = (a);                                        \
-       int32x2_t result;                                                \
-       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_s64(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       int64x1_t b_ = (b);                                              \
-       const int64_t * a_ = (a);                                        \
-       int64x1_t result;                                                \
-       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_u8(a, b, c)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint8x8_t b_ = (b);                                              \
-       const uint8_t * a_ = (a);                                        \
-       uint8x8_t result;                                                \
-       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_u16(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       const uint16_t * a_ = (a);                                       \
-       uint16x4_t result;                                               \
-       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_u32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       const uint32_t * a_ = (a);                                       \
-       uint32x2_t result;                                               \
-       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1_lane_u64(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64x1_t b_ = (b);                                             \
-       const uint64_t * a_ = (a);                                       \
-       uint64x1_t result;                                               \
-       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i" (c), "Utv"(*a_), "0"(b_)                          \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t * a)
 {
@@ -6563,162 +6407,6 @@  vld1q_dup_u64 (const uint64_t * a)
   return result;
 }
 
-#define vld1q_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       const float32_t * a_ = (a);                                      \
-       float32x4_t result;                                              \
-       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_f64(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       const float64_t * a_ = (a);                                      \
-       float64x2_t result;                                              \
-       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_p8(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       poly8x16_t b_ = (b);                                             \
-       const poly8_t * a_ = (a);                                        \
-       poly8x16_t result;                                               \
-       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_p16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       poly16x8_t b_ = (b);                                             \
-       const poly16_t * a_ = (a);                                       \
-       poly16x8_t result;                                               \
-       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_s8(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       int8x16_t b_ = (b);                                              \
-       const int8_t * a_ = (a);                                         \
-       int8x16_t result;                                                \
-       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       const int16_t * a_ = (a);                                        \
-       int16x8_t result;                                                \
-       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       const int32_t * a_ = (a);                                        \
-       int32x4_t result;                                                \
-       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_s64(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int64x2_t b_ = (b);                                              \
-       const int64_t * a_ = (a);                                        \
-       int64x2_t result;                                                \
-       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_u8(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       uint8x16_t b_ = (b);                                             \
-       const uint8_t * a_ = (a);                                        \
-       uint8x16_t result;                                               \
-       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       const uint16_t * a_ = (a);                                       \
-       uint16x8_t result;                                               \
-       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       const uint32_t * a_ = (a);                                       \
-       uint32x4_t result;                                               \
-       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vld1q_lane_u64(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64x2_t b_ = (b);                                             \
-       const uint64_t * a_ = (a);                                       \
-       uint64x2_t result;                                               \
-       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
-                : "=w"(result)                                          \
-                : "i"(c), "Utv"(*a_), "0"(b_)                           \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
 {
@@ -16454,6 +16142,154 @@  vld1q_u64 (const uint64_t *a)
     __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
 }
 
+/* vld1_lane  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
+{
+  return vset_lane_f32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
+{
+  return vset_lane_f64 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
+{
+  return vset_lane_p8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
+{
+  return vset_lane_p16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
+{
+  return vset_lane_s8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
+{
+  return vset_lane_s16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
+{
+  return vset_lane_s32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
+{
+  return vset_lane_s64 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
+{
+  return vset_lane_u8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
+{
+  return vset_lane_u16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
+{
+  return vset_lane_u32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
+{
+  return vset_lane_u64 (*__src, __vec, __lane);
+}
+
+/* vld1q_lane  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
+{
+  return vsetq_lane_f32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
+{
+  return vsetq_lane_f64 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
+{
+  return vsetq_lane_p8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
+{
+  return vsetq_lane_p16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
+{
+  return vsetq_lane_s8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
+{
+  return vsetq_lane_s16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
+{
+  return vsetq_lane_s32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
+{
+  return vsetq_lane_s64 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
+{
+  return vsetq_lane_u8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
+{
+  return vsetq_lane_u16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
+{
+  return vsetq_lane_u32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
+{
+  return vsetq_lane_u64 (*__src, __vec, __lane);
+}
+
 /* vldn */
 
 __extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/testsuite/gcc.target/aarch64/vld1_lane.c b/gcc/testsuite/gcc.target/aarch64/vld1_lane.c
new file mode 100644
index 0000000000000000000000000000000000000000..c2445f8df53034027051722155a40161b86574bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vld1_lane.c
@@ -0,0 +1,85 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define VARIANTS(VARIANT)	\
+VARIANT (uint8, , 8, _u8, 5)	\
+VARIANT (uint16, , 4, _u16, 3)	\
+VARIANT (uint32, , 2, _u32, 1)	\
+VARIANT (uint64, , 1, _u64, 0)	\
+VARIANT (int8, , 8, _s8, 3)	\
+VARIANT (int16, , 4, _s16, 2)	\
+VARIANT (int32, , 2, _s32, 0)	\
+VARIANT (int64, , 1, _s64, 0)	\
+VARIANT (poly8, , 8, _p8, 7)	\
+VARIANT (poly16, , 4, _p16, 2)	\
+VARIANT (float32, , 2, _f32, 1)	\
+VARIANT (float64, , 1, _f64, 0)	\
+VARIANT (uint8, q, 16, _u8, 13)	\
+VARIANT (uint16, q, 8, _u16, 5)	\
+VARIANT (uint32, q, 4, _u32, 1)	\
+VARIANT (uint64, q, 2, _u64, 0)	\
+VARIANT (int8, q, 16, _s8, 15)	\
+VARIANT (int16, q, 8, _s16, 3)	\
+VARIANT (int32, q, 4, _s32, 1)	\
+VARIANT (int64, q, 2, _s64, 1)	\
+VARIANT (poly8, q, 16, _p8, 7)	\
+VARIANT (poly16, q, 8, _p16, 4)	\
+VARIANT (float32, q, 4, _f32, 2)\
+VARIANT (float64, q, 2, _f64, 1)
+
+#define TESTMETH(BASE, Q, ELTS, SUFFIX, LANE)			\
+__attribute__((noinline)) BASE##x##ELTS##_t			\
+wrap_vld1##Q##_lane##SUFFIX (const BASE##_t *load,		\
+			     BASE##x##ELTS##_t vec)		\
+{ return vld1##Q##_lane##SUFFIX (load, vec, LANE); }		\
+int								\
+test_vld1##Q##_lane##SUFFIX (const BASE##_t *data,		\
+			     const BASE##_t *overwrite)		\
+{								\
+  BASE##_t out[ELTS];						\
+  int j;							\
+  BASE##x##ELTS##_t in = vld1##Q##SUFFIX (data);		\
+  in = wrap_vld1##Q##_lane##SUFFIX (overwrite, in);		\
+  vst1##Q##SUFFIX (out, in);					\
+    for (j = 0; j < ELTS; j++)					\
+      if (out[j] != (j == LANE ? *overwrite : data[j]))		\
+        return 1;						\
+  return 0;							\
+}
+
+
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, Q, ELTS, SUFFIX, LANE)			\
+  if (test_vld1##Q##_lane##SUFFIX ((const BASE##_t *)orig_data,	\
+				   BASE##_data) != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  /* Original data for all vector formats.  */
+  uint64_t orig_data[2] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL};
+
+  /* Data with which vldN_lane will overwrite some of previous.  */
+  uint8_t uint8_data[4] = { 7, 11, 13, 17 };
+  uint16_t uint16_data[4] = { 257, 263, 269, 271 };
+  uint32_t uint32_data[4] = { 65537, 65539, 65543, 65551 };
+  uint64_t uint64_data[4] = { 0xdeadbeefcafebabeULL, 0x0123456789abcdefULL,
+			      0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
+  int8_t int8_data[4] = { -1, 3, -5, 7 };
+  int16_t int16_data[4] = { 257, -259, 261, -263 };
+  int32_t int32_data[4] = { 123456789, -987654321, -135792468, 975318642 };
+  int64_t *int64_data = (int64_t *)uint64_data;
+  poly8_t poly8_data[4] = { 0, 7, 13, 18, };
+  poly16_t poly16_data[4] = { 11111, 2222, 333, 44 };
+  float32_t float32_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
+  float64_t float64_data[4] = { 1.010010001, 12345.6789, -9876.54321, 1.618 };
+
+  VARIANTS (CHECK);
+  return 0;
+}