
[AArch64] Fix ICE at -O0 on vld1_lane intrinsics

Message ID 54748C4A.5080406@arm.com
State New

Commit Message

Alan Lawrence Nov. 25, 2014, 2:03 p.m. UTC
The vld1_lane intrinsics ICE at -O0 because they contain a call to the 
vset_lane intrinsics, through which the lane index is not constant-propagated 
(they are fine at -O1 and higher). This patch fixes the ICE by replacing that 
call with a macro.
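
To illustrate the failure mode, here is a simplified sketch of the pre-patch 
structure (the my_* names are hypothetical stand-ins, not the real arm_neon.h 
definitions; it assumes an aarch64 GCC providing the 
__builtin_aarch64_im_lane_boundsi built-in used in the patch below):

#include <arm_neon.h>

__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
my_vset_lane_s16 (int16_t __elem, int16x4_t __vec, const int __index)
{
  /* __index must be a compile-time constant here, but at -O0 the value
     passed in from my_vld1_lane_s16 below arrives only as a copy of a
     parameter, so the bounds-check built-in is not given an immediate.  */
  __builtin_aarch64_im_lane_boundsi (__index, 4);
  __vec[__index] = __elem;
  return __vec;
}

__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
my_vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
{
  return my_vset_lane_s16 (*__src, __vec, __lane);   /* ICEs at -O0.  */
}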

Rather than defining many individual macros 
__aarch64_vset(q?)_lane_[uspf](8|16|32|64), this patch introduces a 
__AARCH64_NUM_LANES macro using sizeof (), so that a single 
__aarch64_vset_lane_any macro handles all variants (with bounds-checking and 
endianness-flipping). This is less error-prone than writing the number of 
lanes for each variant by hand, as before.
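
As a concrete check of the sizeof () arithmetic, this standalone sketch (not 
part of the patch) computes the same lane counts the macro derives:

#include <arm_neon.h>

#define __AARCH64_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))

int
main (void)
{
  int16x4_t d = vdup_n_s16 (0);        /* 8 bytes / 2-byte lanes  -> 4.  */
  int32x4_t q = vdupq_n_s32 (0);       /* 16 bytes / 4-byte lanes -> 4.  */
  float64x2_t f = vdupq_n_f64 (0.0);   /* 16 bytes / 8-byte lanes -> 2.  */
  return (__AARCH64_NUM_LANES (d) == 4
	  && __AARCH64_NUM_LANES (q) == 4
	  && __AARCH64_NUM_LANES (f) == 2) ? 0 : 1;
}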

The endianness-flipping is also factored out into a separate macro, 
__aarch64_lane; I intend to use it for vget_lane too in a follow-up patch.
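
On big-endian, __aarch64_lane flips a 4-lane index so that architectural 
lanes 0, 1, 2, 3 map to GCC vector indices 3, 2, 1, 0; on little-endian it is 
the identity. A small sketch of the mapping (first_arch_lane is a hypothetical 
helper, not part of the patch):

#include <arm_neon.h>

/* Return architectural lane 0, whichever end of the GCC vector holds it.  */
int16_t
first_arch_lane (int16x4_t v)
{
#ifdef __AARCH64EB__
  return v[3];   /* Big-endian: architectural lane 0 is vector index 3.  */
#else
  return v[0];   /* Little-endian: the two numberings coincide.  */
#endif
}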

Tested with check-gcc on aarch64-none-elf and aarch64_be-none-elf (including 
the new test, which FAILs without this patch).

Ok for trunk?


gcc/ChangeLog:

	* config/aarch64/arm_neon.h (__AARCH64_NUM_LANES, __aarch64_lane):
	New macros, the latter defined separately for big- and little-endian.
	(__aarch64_vset_lane_any): Redefine in terms of the above; use one
	definition for both endiannesses.
	(vset_lane_f32, vset_lane_f64, vset_lane_p8, vset_lane_p16,
	vset_lane_s8, vset_lane_s16, vset_lane_s32, vset_lane_s64,
	vset_lane_u8, vset_lane_u16, vset_lane_u32, vset_lane_u64): Remove
	number-of-lanes argument; move later in file.  Likewise for the
	vsetq_lane variants.
	(vld1_lane_f32, vld1_lane_f64, vld1_lane_p8, vld1_lane_p16,
	vld1_lane_s8, vld1_lane_s16, vld1_lane_s32, vld1_lane_s64,
	vld1_lane_u8, vld1_lane_u16, vld1_lane_u32, vld1_lane_u64): Call
	__aarch64_vset_lane_any rather than vset_lane_xxx.  Likewise for
	the vld1q_lane variants.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vld1_lane-o0.c: New test.

Comments

Alan Lawrence Dec. 3, 2014, 3:46 p.m. UTC | #1
Ping.

Alan Lawrence wrote:
> The vld1_lane intrinsics ICE at -O0 because they contain a call to the 
> vset_lane intrinsics, through which the lane index is not constant-propagated 
> (they are fine at -O1 and higher). This patch fixes the ICE by replacing that 
> call with a macro.
> 
> Rather than defining many individual macros 
> __aarch64_vset(q?)_lane_[uspf](8|16|32|64), this patch introduces a 
> __AARCH64_NUM_LANES macro using sizeof (), so that a single 
> __aarch64_vset_lane_any macro handles all variants (with bounds-checking and 
> endianness-flipping). This is less error-prone than writing the number of 
> lanes for each variant by hand, as before.
> 
> The endianness-flipping is also factored out into a separate macro, 
> __aarch64_lane; I intend to use it for vget_lane too in a follow-up patch.
> 
> Tested with check-gcc on aarch64-none-elf and aarch64_be-none-elf (including 
> the new test, which FAILs without this patch).
> 
> Ok for trunk?
> 
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/arm_neon.h (__AARCH64_NUM_LANES, __aarch64_lane):
> 	New macros, the latter defined separately for big- and little-endian.
> 	(__aarch64_vset_lane_any): Redefine in terms of the above; use one
> 	definition for both endiannesses.
> 	(vset_lane_f32, vset_lane_f64, vset_lane_p8, vset_lane_p16,
> 	vset_lane_s8, vset_lane_s16, vset_lane_s32, vset_lane_s64,
> 	vset_lane_u8, vset_lane_u16, vset_lane_u32, vset_lane_u64): Remove
> 	number-of-lanes argument; move later in file.  Likewise for the
> 	vsetq_lane variants.
> 	(vld1_lane_f32, vld1_lane_f64, vld1_lane_p8, vld1_lane_p16,
> 	vld1_lane_s8, vld1_lane_s16, vld1_lane_s32, vld1_lane_s64,
> 	vld1_lane_u8, vld1_lane_u16, vld1_lane_u32, vld1_lane_u64): Call
> 	__aarch64_vset_lane_any rather than vset_lane_xxx.  Likewise for
> 	the vld1q_lane variants.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/vld1_lane-o0.c: New test.
Marcus Shawcroft Dec. 5, 2014, 6:22 p.m. UTC | #2
On 25 November 2014 at 14:03, Alan Lawrence <alan.lawrence@arm.com> wrote:

> gcc/ChangeLog:
>
>         * config/aarch64/arm_neon.h (__AARCH64_NUM_LANES, __aarch64_lane):
>         New macros, the latter defined separately for big- and
>         little-endian.
>         (__aarch64_vset_lane_any): Redefine in terms of the above; use one
>         definition for both endiannesses.
>         (vset_lane_f32, vset_lane_f64, vset_lane_p8, vset_lane_p16,
>         vset_lane_s8, vset_lane_s16, vset_lane_s32, vset_lane_s64,
>         vset_lane_u8, vset_lane_u16, vset_lane_u32, vset_lane_u64): Remove
>         number-of-lanes argument; move later in file.  Likewise for the
>         vsetq_lane variants.
>         (vld1_lane_f32, vld1_lane_f64, vld1_lane_p8, vld1_lane_p16,
>         vld1_lane_s8, vld1_lane_s16, vld1_lane_s32, vld1_lane_s64,
>         vld1_lane_u8, vld1_lane_u16, vld1_lane_u32, vld1_lane_u64): Call
>         __aarch64_vset_lane_any rather than vset_lane_xxx.  Likewise for
>         the vld1q_lane variants.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vld1_lane-o0.c: New test.

OK /Marcus

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 921a5db..1291a8d 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -604,173 +604,28 @@  typedef struct poly16x8x4_t
 #define __aarch64_vdupq_laneq_u64(__a, __b) \
    __aarch64_vdup_lane_any (u64, q, q, __a, __b)
 
-/* vset_lane and vld1_lane internal macro.  */
+/* Internal macro for lane indices.  */
+
+#define __AARCH64_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
 
-#ifdef __AARCH64EB__
 /* For big-endian, GCC's vector indices are the opposite way around
    to the architectural lane indices used by Neon intrinsics.  */
-#define __aarch64_vset_lane_any(__vec, __index, __val, __lanes) \
-  __extension__							\
-  ({								\
-    __builtin_aarch64_im_lane_boundsi (__index, __lanes);	\
-    __vec[__lanes - 1 - __index] = __val;			\
-    __vec;							\
-  })
+#ifdef __AARCH64EB__
+#define __aarch64_lane(__vec, __idx) (__AARCH64_NUM_LANES (__vec) - 1 - __idx)
 #else
-#define __aarch64_vset_lane_any(__vec, __index, __val, __lanes) \
-  __extension__							\
-  ({								\
-    __builtin_aarch64_im_lane_boundsi (__index, __lanes);	\
-    __vec[__index] = __val;					\
-    __vec;							\
-  })
+#define __aarch64_lane(__vec, __idx) __idx
 #endif
 
-/* vset_lane  */
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vset_lane_f32 (float32_t __elem, float32x2_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 2);
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vset_lane_f64 (float64_t __elem, float64x1_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 1);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vset_lane_p8 (poly8_t __elem, poly8x8_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 8);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vset_lane_p16 (poly16_t __elem, poly16x4_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 4);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vset_lane_s8 (int8_t __elem, int8x8_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 8);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vset_lane_s16 (int16_t __elem, int16x4_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 4);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vset_lane_s32 (int32_t __elem, int32x2_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 2);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vset_lane_s64 (int64_t __elem, int64x1_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 1);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vset_lane_u8 (uint8_t __elem, uint8x8_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 8);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vset_lane_u16 (uint16_t __elem, uint16x4_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 4);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vset_lane_u32 (uint32_t __elem, uint32x2_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 2);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 1);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vsetq_lane_f32 (float32_t __elem, float32x4_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 4);
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vsetq_lane_f64 (float64_t __elem, float64x2_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 2);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vsetq_lane_p8 (poly8_t __elem, poly8x16_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 16);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vsetq_lane_p16 (poly16_t __elem, poly16x8_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 8);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsetq_lane_s8 (int8_t __elem, int8x16_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 16);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsetq_lane_s16 (int16_t __elem, int16x8_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 8);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsetq_lane_s32 (int32_t __elem, int32x4_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 4);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsetq_lane_s64 (int64_t __elem, int64x2_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 2);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsetq_lane_u8 (uint8_t __elem, uint8x16_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 16);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsetq_lane_u16 (uint16_t __elem, uint16x8_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 8);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsetq_lane_u32 (uint32_t __elem, uint32x4_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 4);
-}
+/* vset_lane and vld1_lane internal macro.  */
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
-{
-  return __aarch64_vset_lane_any (__vec, __index, __elem, 2);
-}
+#define __aarch64_vset_lane_any(__elem, __vec, __index)			\
+  __extension__								\
+  ({									\
+    __builtin_aarch64_im_lane_boundsi (__index,			\
+       __AARCH64_NUM_LANES (__vec));					\
+    __vec[__aarch64_lane (__vec, __index)] = __elem;			\
+    __vec;								\
+  })
 
 /* vadd  */
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
@@ -4387,6 +4242,154 @@  vreinterpretq_u32_p16 (poly16x8_t __a)
   return (uint32x4_t) __a;
 }
 
+/* vset_lane  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vset_lane_f32 (float32_t __elem, float32x2_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vset_lane_f64 (float64_t __elem, float64x1_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vset_lane_p8 (poly8_t __elem, poly8x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vset_lane_p16 (poly16_t __elem, poly16x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vset_lane_s8 (int8_t __elem, int8x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vset_lane_s16 (int16_t __elem, int16x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vset_lane_s32 (int32_t __elem, int32x2_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vset_lane_s64 (int64_t __elem, int64x1_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vset_lane_u8 (uint8_t __elem, uint8x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vset_lane_u16 (uint16_t __elem, uint16x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vset_lane_u32 (uint32_t __elem, uint32x2_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+/* vsetq_lane  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_f32 (float32_t __elem, float32x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vsetq_lane_f64 (float64_t __elem, float64x2_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_p8 (poly8_t __elem, poly8x16_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_p16 (poly16_t __elem, poly16x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_s8 (int8_t __elem, int8x16_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_s16 (int16_t __elem, int16x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_s32 (int32_t __elem, int32x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsetq_lane_s64 (int64_t __elem, int64x2_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_u8 (uint8_t __elem, uint8x16_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_u16 (uint16_t __elem, uint16x8_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_u32 (uint32_t __elem, uint32x4_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
 #define __GET_LOW(__TYPE) \
   uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a);  \
   uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0));  \
@@ -16007,73 +16010,73 @@  vld1q_dup_u64 (const uint64_t* __a)
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
 {
-  return vset_lane_f32 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
 vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
 {
-  return vset_lane_f64 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
 {
-  return vset_lane_p8 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
 {
-  return vset_lane_p16 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
 {
-  return vset_lane_s8 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
 {
-  return vset_lane_s16 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
 {
-  return vset_lane_s32 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
 vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
 {
-  return vset_lane_s64 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
 {
-  return vset_lane_u8 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
 {
-  return vset_lane_u16 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
 {
-  return vset_lane_u32 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
 {
-  return vset_lane_u64 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 /* vld1q_lane  */
@@ -16081,73 +16084,73 @@  vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
 {
-  return vsetq_lane_f32 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
 vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
 {
-  return vsetq_lane_f64 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
 {
-  return vsetq_lane_p8 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
 {
-  return vsetq_lane_p16 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
 {
-  return vsetq_lane_s8 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
 {
-  return vsetq_lane_s16 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
 {
-  return vsetq_lane_s32 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
 {
-  return vsetq_lane_s64 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
 {
-  return vsetq_lane_u8 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
 {
-  return vsetq_lane_u16 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
 {
-  return vsetq_lane_u32 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
 {
-  return vsetq_lane_u64 (*__src, __vec, __lane);
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
 /* vldn */
diff --git a/gcc/testsuite/gcc.target/aarch64/vld1_lane-o0.c b/gcc/testsuite/gcc.target/aarch64/vld1_lane-o0.c
new file mode 100644
index 0000000..58e0c9d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vld1_lane-o0.c
@@ -0,0 +1,13 @@ 
+/* PR/63950 Test bounds checking at -O0.  */
+
+/* { dg-options "-std=c99 -O0" } */
+
+#include <arm_neon.h>
+
+int
+main (int argc, char **argv)
+{
+  int16x4_t in = vcreate_s16 (0xdeadbeef00000000ULL);
+  int16_t src = 17;
+  int16x4_t out = vld1_lane_s16 (&src, in, 1);
+}