Patchwork [AArch64] Implement vcopy intrinsics.

login
register
mail settings
Submitter James Greenhalgh
Date Sept. 13, 2013, 6:39 p.m.
Message ID <1379097593-27994-1-git-send-email-james.greenhalgh@arm.com>
Download mbox | patch
Permalink /patch/274856/
State New
Headers show

Comments

James Greenhalgh - Sept. 13, 2013, 6:39 p.m.
Hi,

This patch adds intrinsics for vcopy<q>_lane<q>_<pfsu><8,16,32,64>.

These are implemented in an optimal way using the vget_lane and vset_lane
intrinsics and a combine pattern.

I've added a testcase and run a full regression run for aarch64-none-elf.

OK?

Thanks,
James

---
gcc/

2013-09-13  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-simd.md
	(*aarch64_simd_vec_copy_lane<mode>): New.
	(*aarch64_simd_vec_copy_lane_<vswap_width_name><mode>): Likewise.
	* config/aarch64/arm_neon.h
	(vcopy<q>_lane_<pfsu><8,16,32,64>): Remove asm implementations.
	(vcopy<q>_lane<q>_<pfsu><8,16,32,64>): Implement optimally.

gcc/testsuite

2013-09-13  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/vect_copy_lane_1.c: New.
Marcus Shawcroft - Sept. 16, 2013, 8:27 a.m.
On 13/09/13 19:39, James Greenhalgh wrote:
>
> Hi,
>
> This patch adds intrinsics for vcopy<q>_lane<q>_<pfsu><8,16,32,64>.
>
> These are implemented in an optimal way using the vget_lane and vset_lane
> intrinsics and a combine pattern.
>
> I've added a testcase and run a full regression run for aarch64-none-elf.
>
> OK?
>
> Thanks,
> James

OK
/Marcus

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f13cd5b7cdbdff95bbc378a76a6dd05de031487d..9703dd934a2f8335ffc5086e8a421db609fe0236 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -750,6 +750,54 @@  (define_insn "aarch64_simd_vec_set<mode>
    (set_attr "simd_mode" "<MODE>")]
 )
 
+(define_insn_and_split "*aarch64_simd_vec_copy_lane<mode>"
+  [(set (match_operand:VALL 0 "register_operand" "=w")
+	(vec_merge:VALL
+	    (vec_duplicate:VALL
+	      (vec_select:<VEL>
+		(match_operand:VALL 3 "register_operand" "w")
+		(parallel
+		  [(match_operand:SI 4 "immediate_operand" "i")])))
+	    (match_operand:VALL 1 "register_operand" "0")
+	    (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_SIMD"
+  "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
+  "reload_completed
+   && REGNO (operands[0]) == REGNO (operands[3])
+   && (exact_log2 (INTVAL (operands[2])) == INTVAL (operands[4]))"
+  [(const_int 0)]
+  {
+    emit_note (NOTE_INSN_DELETED);
+    DONE;
+  }
+  [(set_attr "simd_type" "simd_ins")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn_and_split "*aarch64_simd_vec_copy_lane_<vswap_width_name><mode>"
+  [(set (match_operand:VALL 0 "register_operand" "=w")
+	(vec_merge:VALL
+	    (vec_duplicate:VALL
+	      (vec_select:<VEL>
+		(match_operand:<VSWAP_WIDTH> 3 "register_operand" "w")
+		(parallel
+		  [(match_operand:SI 4 "immediate_operand" "i")])))
+	    (match_operand:VALL 1 "register_operand" "0")
+	    (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_SIMD"
+  "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
+  "reload_completed
+   && REGNO (operands[0]) == REGNO (operands[3])
+   && (exact_log2 (INTVAL (operands[2])) == INTVAL (operands[4]))"
+  [(const_int 0)]
+  {
+    emit_note (NOTE_INSN_DELETED);
+    DONE;
+  }
+  [(set_attr "simd_type" "simd_ins")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
 (define_insn "aarch64_simd_lshr<mode>"
  [(set (match_operand:VDQ 0 "register_operand" "=w")
        (lshiftrt:VDQ (match_operand:VDQ 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6335ddf..64f8825 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -5538,162 +5538,6 @@  vcntq_u8 (uint8x16_t a)
   return result;
 }
 
-#define vcopyq_lane_f32(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t c_ = (c);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("ins %0.s[%2], %3.s[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_f64(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t c_ = (c);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("ins %0.d[%2], %3.d[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_p8(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       poly8x16_t c_ = (c);                                             \
-       poly8x16_t a_ = (a);                                             \
-       poly8x16_t result;                                               \
-       __asm__ ("ins %0.b[%2], %3.b[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_p16(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       poly16x8_t c_ = (c);                                             \
-       poly16x8_t a_ = (a);                                             \
-       poly16x8_t result;                                               \
-       __asm__ ("ins %0.h[%2], %3.h[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_s8(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       int8x16_t c_ = (c);                                              \
-       int8x16_t a_ = (a);                                              \
-       int8x16_t result;                                                \
-       __asm__ ("ins %0.b[%2], %3.b[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_s16(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t c_ = (c);                                              \
-       int16x8_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("ins %0.h[%2], %3.h[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_s32(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t c_ = (c);                                              \
-       int32x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("ins %0.s[%2], %3.s[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_s64(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       int64x2_t c_ = (c);                                              \
-       int64x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("ins %0.d[%2], %3.d[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_u8(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       uint8x16_t c_ = (c);                                             \
-       uint8x16_t a_ = (a);                                             \
-       uint8x16_t result;                                               \
-       __asm__ ("ins %0.b[%2], %3.b[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_u16(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t c_ = (c);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("ins %0.h[%2], %3.h[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_u32(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t c_ = (c);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("ins %0.s[%2], %3.s[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcopyq_lane_u64(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64x2_t c_ = (c);                                             \
-       uint64x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("ins %0.d[%2], %3.d[%4]"                                \
-                : "=w"(result)                                          \
-                : "0"(a_), "i"(b), "w"(c_), "i"(d)                      \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 /* vcvt_f16_f32 not supported */
 
 /* vcvt_f32_f16 not supported */
@@ -17913,6 +17757,398 @@  vcltzd_f64 (float64_t __a)
   return __a < 0.0 ? -1ll : 0ll;
 }
 
+/* vcopy_lane.  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcopy_lane_f32 (float32x2_t __a, const int __lane1,
+		float32x2_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_f32 (__aarch64_vget_lane_f32 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vcopy_lane_f64 (float64x1_t __a, const int __lane1,
+		float64x1_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_f64 (__aarch64_vget_lane_f64 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vcopy_lane_p8 (poly8x8_t __a, const int __lane1,
+	       poly8x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_p8 (__aarch64_vget_lane_p8 (__b, __lane2),
+				 __a, __lane1);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vcopy_lane_p16 (poly16x4_t __a, const int __lane1,
+		poly16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_p16 (__aarch64_vget_lane_p16 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcopy_lane_s8 (int8x8_t __a, const int __lane1,
+	       int8x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_s8 (__aarch64_vget_lane_s8 (__b, __lane2),
+				 __a, __lane1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcopy_lane_s16 (int16x4_t __a, const int __lane1,
+		int16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_s16 (__aarch64_vget_lane_s16 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcopy_lane_s32 (int32x2_t __a, const int __lane1,
+		int32x2_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_s32 (__aarch64_vget_lane_s32 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vcopy_lane_s64 (int64x1_t __a, const int __lane1,
+		int64x1_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_s64 (__aarch64_vget_lane_s64 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcopy_lane_u8 (uint8x8_t __a, const int __lane1,
+	       uint8x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_u8 (__aarch64_vget_lane_u8 (__b, __lane2),
+				 __a, __lane1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcopy_lane_u16 (uint16x4_t __a, const int __lane1,
+		uint16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_u16 (__aarch64_vget_lane_u16 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcopy_lane_u32 (uint32x2_t __a, const int __lane1,
+		uint32x2_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_u32 (__aarch64_vget_lane_u32 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcopy_lane_u64 (uint64x1_t __a, const int __lane1,
+		uint64x1_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_u64 (__aarch64_vget_lane_u64 (__b, __lane2),
+				  __a, __lane1);
+}
+
+/* vcopy_laneq.  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcopy_laneq_f32 (float32x2_t __a, const int __lane1,
+		 float32x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_f32 (__aarch64_vgetq_lane_f32 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vcopy_laneq_f64 (float64x1_t __a, const int __lane1,
+		 float64x2_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_f64 (__aarch64_vgetq_lane_f64 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vcopy_laneq_p8 (poly8x8_t __a, const int __lane1,
+		poly8x16_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_p8 (__aarch64_vgetq_lane_p8 (__b, __lane2),
+				 __a, __lane1);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vcopy_laneq_p16 (poly16x4_t __a, const int __lane1,
+		 poly16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_p16 (__aarch64_vgetq_lane_p16 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcopy_laneq_s8 (int8x8_t __a, const int __lane1,
+		int8x16_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_s8 (__aarch64_vgetq_lane_s8 (__b, __lane2),
+				 __a, __lane1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcopy_laneq_s16 (int16x4_t __a, const int __lane1,
+		 int16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_s16 (__aarch64_vgetq_lane_s16 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcopy_laneq_s32 (int32x2_t __a, const int __lane1,
+		 int32x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_s32 (__aarch64_vgetq_lane_s32 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vcopy_laneq_s64 (int64x1_t __a, const int __lane1,
+		 int64x2_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_s64 (__aarch64_vgetq_lane_s64 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcopy_laneq_u8 (uint8x8_t __a, const int __lane1,
+		uint8x16_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_u8 (__aarch64_vgetq_lane_u8 (__b, __lane2),
+				 __a, __lane1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcopy_laneq_u16 (uint16x4_t __a, const int __lane1,
+		 uint16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_u16 (__aarch64_vgetq_lane_u16 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcopy_laneq_u32 (uint32x2_t __a, const int __lane1,
+		 uint32x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_u32 (__aarch64_vgetq_lane_u32 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcopy_laneq_u64 (uint64x1_t __a, const int __lane1,
+		 uint64x2_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_u64 (__aarch64_vgetq_lane_u64 (__b, __lane2),
+				  __a, __lane1);
+}
+
+/* vcopyq_lane.  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcopyq_lane_f32 (float32x4_t __a, const int __lane1,
+		 float32x2_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_f32 (__aarch64_vget_lane_f32 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcopyq_lane_f64 (float64x2_t __a, const int __lane1,
+		 float64x1_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_f64 (__aarch64_vget_lane_f64 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vcopyq_lane_p8 (poly8x16_t __a, const int __lane1,
+		poly8x8_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_p8 (__aarch64_vget_lane_p8 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vcopyq_lane_p16 (poly16x8_t __a, const int __lane1,
+		 poly16x4_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_p16 (__aarch64_vget_lane_p16 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vcopyq_lane_s8 (int8x16_t __a, const int __lane1,
+		int8x8_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_s8 (__aarch64_vget_lane_s8 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcopyq_lane_s16 (int16x8_t __a, const int __lane1,
+		 int16x4_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_s16 (__aarch64_vget_lane_s16 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcopyq_lane_s32 (int32x4_t __a, const int __lane1,
+		 int32x2_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_s32 (__aarch64_vget_lane_s32 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcopyq_lane_s64 (int64x2_t __a, const int __lane1,
+		 int64x1_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_s64 (__aarch64_vget_lane_s64 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcopyq_lane_u8 (uint8x16_t __a, const int __lane1,
+		uint8x8_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_u8 (__aarch64_vget_lane_u8 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcopyq_lane_u16 (uint16x8_t __a, const int __lane1,
+		 uint16x4_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_u16 (__aarch64_vget_lane_u16 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcopyq_lane_u32 (uint32x4_t __a, const int __lane1,
+		 uint32x2_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_u32 (__aarch64_vget_lane_u32 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcopyq_lane_u64 (uint64x2_t __a, const int __lane1,
+		 uint64x1_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_u64 (__aarch64_vget_lane_u64 (__b, __lane2),
+				   __a, __lane1);
+}
+
+/* vcopyq_laneq.  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcopyq_laneq_f32 (float32x4_t __a, const int __lane1,
+		  float32x4_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_f32 (__aarch64_vgetq_lane_f32 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcopyq_laneq_f64 (float64x2_t __a, const int __lane1,
+		  float64x2_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_f64 (__aarch64_vgetq_lane_f64 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vcopyq_laneq_p8 (poly8x16_t __a, const int __lane1,
+		 poly8x16_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_p8 (__aarch64_vgetq_lane_p8 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vcopyq_laneq_p16 (poly16x8_t __a, const int __lane1,
+		  poly16x8_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_p16 (__aarch64_vgetq_lane_p16 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vcopyq_laneq_s8 (int8x16_t __a, const int __lane1,
+		 int8x16_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_s8 (__aarch64_vgetq_lane_s8 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcopyq_laneq_s16 (int16x8_t __a, const int __lane1,
+		  int16x8_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_s16 (__aarch64_vgetq_lane_s16 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcopyq_laneq_s32 (int32x4_t __a, const int __lane1,
+		  int32x4_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_s32 (__aarch64_vgetq_lane_s32 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcopyq_laneq_s64 (int64x2_t __a, const int __lane1,
+		  int64x2_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_s64 (__aarch64_vgetq_lane_s64 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcopyq_laneq_u8 (uint8x16_t __a, const int __lane1,
+		 uint8x16_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_u8 (__aarch64_vgetq_lane_u8 (__b, __lane2),
+				  __a, __lane1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcopyq_laneq_u16 (uint16x8_t __a, const int __lane1,
+		  uint16x8_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_u16 (__aarch64_vgetq_lane_u16 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcopyq_laneq_u32 (uint32x4_t __a, const int __lane1,
+		  uint32x4_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_u32 (__aarch64_vgetq_lane_u32 (__b, __lane2),
+				   __a, __lane1);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcopyq_laneq_u64 (uint64x2_t __a, const int __lane1,
+		  uint64x2_t __b, const int __lane2)
+{
+  return __aarch64_vsetq_lane_u64 (__aarch64_vgetq_lane_u64 (__b, __lane2),
+				   __a, __lane1);
+}
+
 /* vcvt (double -> float).  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
new file mode 100644
index 0000000..c4f28e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
@@ -0,0 +1,90 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+
+#define BUILD_TEST(TYPE1, TYPE2, Q1, Q2, SUFFIX, INDEX1, INDEX2)	\
+TYPE1									\
+test_copy##Q1##_lane##Q2##_##SUFFIX (TYPE1 a, TYPE2 b)			\
+{									\
+  return vcopy##Q1##_lane##Q2##_##SUFFIX (a, INDEX1, b, INDEX2);	\
+}
+
+/* vcopy_lane.  */
+BUILD_TEST (poly8x8_t, poly8x8_t, , , p8, 7, 6)
+BUILD_TEST (int8x8_t,  int8x8_t,  , , s8, 7, 6)
+BUILD_TEST (uint8x8_t, uint8x8_t, , , u8, 7, 6)
+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 3 } } */
+BUILD_TEST (poly16x4_t, poly16x4_t, , , p16, 3, 2)
+BUILD_TEST (int16x4_t,  int16x4_t,  , , s16, 3, 2)
+BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2)
+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[2\\\]" 3 } } */
+BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
+BUILD_TEST (int32x2_t,   int32x2_t,   , , s32, 1, 0)
+BUILD_TEST (uint32x2_t,  uint32x2_t,  , , u32, 1, 0)
+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */
+BUILD_TEST (float64x1_t, float64x1_t, , , f64, 0, 0)
+/* { dg-final { scan-assembler-times "fmov\\td0, d1" 1 } } */
+BUILD_TEST (int64x1_t,   int64x1_t,   , , s64, 0, 0)
+BUILD_TEST (uint64x1_t,  uint64x1_t,  , , u64, 0, 0)
+/* { dg-final { scan-assembler-times "mov\\tx0, x1" 2 } } */
+
+/* vcopy_laneq.  */
+
+BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15)
+BUILD_TEST (int8x8_t,  int8x16_t,  , q, s8, 7, 15)
+BUILD_TEST (uint8x8_t, uint8x16_t, , q, u8, 7, 15)
+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 3 } } */
+BUILD_TEST (poly16x4_t, poly16x8_t, , q, p16, 3, 7)
+BUILD_TEST (int16x4_t,  int16x8_t,  , q, s16, 3, 7)
+BUILD_TEST (uint16x4_t, uint16x8_t, , q, u16, 3, 7)
+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[7\\\]" 3 } } */
+BUILD_TEST (float32x2_t, float32x4_t, , q, f32, 1, 3)
+BUILD_TEST (int32x2_t,   int32x4_t,   , q, s32, 1, 3)
+BUILD_TEST (uint32x2_t,  uint32x4_t,  , q, u32, 1, 3)
+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[3\\\]" 3 } } */
+BUILD_TEST (float64x1_t, float64x2_t, , q, f64, 0, 1)
+/* { dg-final { scan-assembler-times "dup\\td0, v1.d\\\[1\\\]" 1 } } */
+BUILD_TEST (int64x1_t,  int64x2_t,    , q, s64, 0, 1)
+BUILD_TEST (uint64x1_t, uint64x2_t,   , q, u64, 0, 1)
+/* { dg-final { scan-assembler-times "umov\\tx0, v0.d\\\[1\\\]" 2 } } */
+
+/* vcopyq_lane.  */
+BUILD_TEST (poly8x16_t, poly8x8_t, q, , p8, 15, 7)
+BUILD_TEST (int8x16_t,  int8x8_t,  q, , s8, 15, 7)
+BUILD_TEST (uint8x16_t, uint8x8_t, q, , u8, 15, 7)
+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 3 } } */
+BUILD_TEST (poly16x8_t, poly16x4_t, q, , p16, 7, 3)
+BUILD_TEST (int16x8_t,  int16x4_t,  q, , s16, 7, 3)
+BUILD_TEST (uint16x8_t, uint16x4_t, q, , u16, 7, 3)
+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[7\\\], v1.h\\\[3\\\]" 3 } } */
+BUILD_TEST (float32x4_t, float32x2_t, q, , f32, 3, 1)
+BUILD_TEST (int32x4_t,   int32x2_t,   q, , s32, 3, 1)
+BUILD_TEST (uint32x4_t,  uint32x2_t,  q, , u32, 3, 1)
+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[3\\\], v1.s\\\[1\\\]" 3 } } */
+BUILD_TEST (float64x2_t, float64x1_t, q, , f64, 1, 0)
+/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[0\\\]" 1 } } */
+BUILD_TEST (int64x2_t,   int64x1_t,   q, , s64, 1, 0)
+BUILD_TEST (uint64x2_t,  uint64x1_t,  q, , u64, 1, 0)
+/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], x0" 2 } } */
+
+/* vcopyq_laneq.  */
+
+BUILD_TEST (poly8x16_t, poly8x16_t, q, q, p8, 14, 15)
+BUILD_TEST (int8x16_t,  int8x16_t,  q, q, s8, 14, 15)
+BUILD_TEST (uint8x16_t, uint8x16_t, q, q, u8, 14, 15)
+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 3 } } */
+BUILD_TEST (poly16x8_t, poly16x8_t, q, q, p16, 6, 7)
+BUILD_TEST (int16x8_t,  int16x8_t,  q, q, s16, 6, 7)
+BUILD_TEST (uint16x8_t, uint16x8_t, q, q, u16, 6, 7)
+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[6\\\], v1.h\\\[7\\\]" 3 } } */
+BUILD_TEST (float32x4_t, float32x4_t, q, q, f32, 2, 3)
+BUILD_TEST (int32x4_t,   int32x4_t,   q, q, s32, 2, 3)
+BUILD_TEST (uint32x4_t,  uint32x4_t,  q, q, u32, 2, 3)
+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[2\\\], v1.s\\\[3\\\]" 3 } } */
+BUILD_TEST (float64x2_t, float64x2_t, q, q, f64, 1, 1)
+BUILD_TEST (int64x2_t,   int64x2_t,   q,  q, s64, 1, 1)
+BUILD_TEST (uint64x2_t,  uint64x2_t,  q, q, u64, 1, 1)
+/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[1\\\]" 3 } } */
+
+/* { dg-final { cleanup-saved-temps } } */