Patchwork [AArch64] Convert ld1, st1 arm_neon.h intrinsics to RTL builtins.

login
register
mail settings
Submitter James Greenhalgh
Date July 2, 2013, 9:01 a.m.
Message ID <1372755694-15809-1-git-send-email-james.greenhalgh@arm.com>
Download mbox | patch
Permalink /patch/256305/
State New
Headers show

Comments

James Greenhalgh - July 2, 2013, 9:01 a.m.
Hello,

As they stand the ld1 and st1 intrinsics are broken. The constraints
used in their assembler definitions are not strong enough to prevent
GCC from performing some exciting reorders moving around loads and
stores.

Rewriting these intrinsics as RTL builtins settles GCC's errant
behaviour and results in better code generation.

There may be a time in the future at which we are safe to write
these intrinsics using the vector extension array indexing. But
this is currently not safe for big-endian code. For now, this fixes
a bug and is not too ugly.

Regression tested on aarch64-none-elf and aarch64_be-none-elf with
no regressions.

OK?

Thanks,
James

---
gcc/

2013-07-02  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-builtins.c
	(aarch64_simd_expand_builtin): Handle AARCH64_SIMD_STORE1.
	* config/aarch64/aarch64-simd-builtins.def (ld1): New.
	(st1): Likewise.
	* config/aarch64/aarch64-simd.md
	(aarch64_ld1<VALL:mode>): New.
	(aarch64_st1<VALL:mode>): Likewise.
	* config/aarch64/arm_neon.h
	(vld1<q>_<fpsu><8, 16, 32, 64>): Convert to RTL builtins.
Marcus Shawcroft - July 3, 2013, 9:21 a.m.
On 02/07/13 10:01, James Greenhalgh wrote:

> 2013-07-02  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* config/aarch64/aarch64-builtins.c
> 	(aarch64_simd_expand_builtin): Handle AARCH64_SIMD_STORE1.
> 	* config/aarch64/aarch64-simd-builtins.def (ld1): New.
> 	(st1): Likewise.
> 	* config/aarch64/aarch64-simd.md
> 	(aarch64_ld1<VALL:mode>): New.
> 	(aarch64_st1<VALL:mode>): Likewise.
> 	* config/aarch64/arm_neon.h
> 	(vld1<q>_<fpsu><8, 16, 32, 64>): Convert to RTL builtins.
>

OK
/MArcus

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 2a0e5fd..f49f06b 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1123,6 +1123,7 @@  aarch64_simd_expand_builtin (int fcode, tree exp, rtx target)
       return aarch64_simd_expand_args (target, icode, 1, exp,
 				       SIMD_ARG_COPY_TO_REG, SIMD_ARG_STOP);
 
+    case AARCH64_SIMD_STORE1:
     case AARCH64_SIMD_STORESTRUCT:
       return aarch64_simd_expand_args (target, icode, 0, exp,
 				       SIMD_ARG_COPY_TO_REG,
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4d9b966..af2dd6e 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -354,3 +354,10 @@ 
 
   VAR1 (UNOP, float_extend_lo_, 0, v2df)
   VAR1 (UNOP, float_truncate_lo_, 0, v2sf)
+
+  /* Implemented by aarch64_ld1<VALL:mode>.  */
+  BUILTIN_VALL (LOAD1, ld1, 0)
+
+  /* Implemented by aarch64_st1<VALL:mode>.  */
+  BUILTIN_VALL (STORE1, st1, 0)
+
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 08826b5..178efdc 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3882,6 +3882,17 @@ 
   DONE;
 })
 
+(define_expand "aarch64_ld1<VALL:mode>"
+ [(match_operand:VALL 0 "register_operand")
+  (match_operand:DI 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <VALL:MODE>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+  emit_move_insn (operands[0], mem);
+  DONE;
+})
+
 (define_expand "aarch64_ld<VSTRUCT:nregs><VQ:mode>"
  [(match_operand:VSTRUCT 0 "register_operand" "=w")
   (match_operand:DI 1 "register_operand" "r")
@@ -4098,6 +4109,17 @@ 
   DONE;
 })
 
+(define_expand "aarch64_st1<VALL:mode>"
+ [(match_operand:DI 0 "register_operand")
+  (match_operand:VALL 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <VALL:MODE>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[0]);
+  emit_move_insn (mem, operands[1]);
+  DONE;
+})
+
 ;; Expander for builtins to insert vector registers into large
 ;; opaque integer modes.
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 760ba3d..13ef11d 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7209,28 +7209,6 @@  vld1_dup_u64 (const uint64_t * a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vld1_f32 (const float32_t * a)
-{
-  float32x2_t result;
-  __asm__ ("ld1 {%0.2s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const float32x2_t *_a = (float32x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vld1_f64 (const float64_t * a)
-{
-  float64x1_t result;
-  __asm__ ("ld1 {%0.1d}, %1"
-	   : "=w"(result)
-	   : "Utv"(*a)
-	   : /* No clobbers */);
-  return result;
-}
-
 #define vld1_lane_f32(a, b, c)                                          \
   __extension__                                                         \
     ({                                                                  \
@@ -7387,116 +7365,6 @@  vld1_f64 (const float64_t * a)
        result;                                                          \
      })
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vld1_p8 (const poly8_t * a)
-{
-  poly8x8_t result;
-  __asm__ ("ld1 {%0.8b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const poly8x8_t *_a = (poly8x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vld1_p16 (const poly16_t * a)
-{
-  poly16x4_t result;
-  __asm__ ("ld1 {%0.4h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const poly16x4_t *_a = (poly16x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vld1_s8 (const int8_t * a)
-{
-  int8x8_t result;
-  __asm__ ("ld1 {%0.8b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int8x8_t *_a = (int8x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vld1_s16 (const int16_t * a)
-{
-  int16x4_t result;
-  __asm__ ("ld1 {%0.4h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int16x4_t *_a = (int16x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vld1_s32 (const int32_t * a)
-{
-  int32x2_t result;
-  __asm__ ("ld1 {%0.2s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int32x2_t *_a = (int32x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vld1_s64 (const int64_t * a)
-{
-  int64x1_t result;
-  __asm__ ("ld1 {%0.1d}, %1"
-	   : "=w"(result)
-	   : "Utv"(*a)
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vld1_u8 (const uint8_t * a)
-{
-  uint8x8_t result;
-  __asm__ ("ld1 {%0.8b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint8x8_t *_a = (uint8x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vld1_u16 (const uint16_t * a)
-{
-  uint16x4_t result;
-  __asm__ ("ld1 {%0.4h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint16x4_t *_a = (uint16x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vld1_u32 (const uint32_t * a)
-{
-  uint32x2_t result;
-  __asm__ ("ld1 {%0.2s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint32x2_t *_a = (uint32x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vld1_u64 (const uint64_t * a)
-{
-  uint64x1_t result;
-  __asm__ ("ld1 {%0.1d}, %1"
-	   : "=w"(result)
-	   : "Utv"(*a)
-	   : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t * a)
 {
@@ -7629,28 +7497,6 @@  vld1q_dup_u64 (const uint64_t * a)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vld1q_f32 (const float32_t * a)
-{
-  float32x4_t result;
-  __asm__ ("ld1 {%0.4s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const float32x4_t *_a = (float32x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vld1q_f64 (const float64_t * a)
-{
-  float64x2_t result;
-  __asm__ ("ld1 {%0.2d}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const float64x2_t *_a = (float64x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
 #define vld1q_lane_f32(a, b, c)                                         \
   __extension__                                                         \
     ({                                                                  \
@@ -7807,116 +7653,6 @@  vld1q_f64 (const float64_t * a)
        result;                                                          \
      })
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vld1q_p8 (const poly8_t * a)
-{
-  poly8x16_t result;
-  __asm__ ("ld1 {%0.16b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const poly8x16_t *_a = (poly8x16_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vld1q_p16 (const poly16_t * a)
-{
-  poly16x8_t result;
-  __asm__ ("ld1 {%0.16b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const poly16x8_t *_a = (poly16x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vld1q_s8 (const int8_t * a)
-{
-  int8x16_t result;
-  __asm__ ("ld1 {%0.16b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int8x16_t *_a = (int8x16_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vld1q_s16 (const int16_t * a)
-{
-  int16x8_t result;
-  __asm__ ("ld1 {%0.8h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int16x8_t *_a = (int16x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vld1q_s32 (const int32_t * a)
-{
-  int32x4_t result;
-  __asm__ ("ld1 {%0.4s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int32x4_t *_a = (int32x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vld1q_s64 (const int64_t * a)
-{
-  int64x2_t result;
-  __asm__ ("ld1 {%0.2d}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int64x2_t *_a = (int64x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vld1q_u8 (const uint8_t * a)
-{
-  uint8x16_t result;
-  __asm__ ("ld1 {%0.16b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint8x16_t *_a = (uint8x16_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vld1q_u16 (const uint16_t * a)
-{
-  uint16x8_t result;
-  __asm__ ("ld1 {%0.8h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint16x8_t *_a = (uint16x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vld1q_u32 (const uint32_t * a)
-{
-  uint32x4_t result;
-  __asm__ ("ld1 {%0.4s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint32x4_t *_a = (uint32x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vld1q_u64 (const uint64_t * a)
-{
-  uint64x2_t result;
-  __asm__ ("ld1 {%0.2d}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint64x2_t *_a = (uint64x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
 #define vmla_lane_f32(a, b, c, d)                                       \
   __extension__                                                         \
     ({                                                                  \
@@ -14382,24 +14118,6 @@  vrsubhn_u64 (uint64x2_t a, uint64x2_t b)
        result;                                                          \
      })
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_f32 (float32_t * a, float32x2_t b)
-{
-  __asm__ ("st1 {%1.2s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_f64 (float64_t * a, float64x1_t b)
-{
-  __asm__ ("st1 {%1.1d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
 #define vst1_lane_f32(a, b, c)                                          \
   __extension__                                                         \
     ({                                                                  \
@@ -14532,113 +14250,6 @@  vst1_f64 (float64_t * a, float64x1_t b)
                 : "memory");                                            \
      })
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_p8 (poly8_t * a, poly8x8_t b)
-{
-  __asm__ ("st1 {%1.8b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_p16 (poly16_t * a, poly16x4_t b)
-{
-  __asm__ ("st1 {%1.4h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s8 (int8_t * a, int8x8_t b)
-{
-  __asm__ ("st1 {%1.8b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s16 (int16_t * a, int16x4_t b)
-{
-  __asm__ ("st1 {%1.4h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s32 (int32_t * a, int32x2_t b)
-{
-  __asm__ ("st1 {%1.2s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s64 (int64_t * a, int64x1_t b)
-{
-  __asm__ ("st1 {%1.1d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u8 (uint8_t * a, uint8x8_t b)
-{
-  __asm__ ("st1 {%1.8b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u16 (uint16_t * a, uint16x4_t b)
-{
-  __asm__ ("st1 {%1.4h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u32 (uint32_t * a, uint32x2_t b)
-{
-  __asm__ ("st1 {%1.2s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u64 (uint64_t * a, uint64x1_t b)
-{
-  __asm__ ("st1 {%1.1d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_f32 (float32_t * a, float32x4_t b)
-{
-  __asm__ ("st1 {%1.4s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_f64 (float64_t * a, float64x2_t b)
-{
-  __asm__ ("st1 {%1.2d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
 
 #define vst1q_lane_f32(a, b, c)                                         \
   __extension__                                                         \
@@ -14772,96 +14383,6 @@  vst1q_f64 (float64_t * a, float64x2_t b)
                 : "memory");                                            \
      })
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_p8 (poly8_t * a, poly8x16_t b)
-{
-  __asm__ ("st1 {%1.16b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_p16 (poly16_t * a, poly16x8_t b)
-{
-  __asm__ ("st1 {%1.8h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s8 (int8_t * a, int8x16_t b)
-{
-  __asm__ ("st1 {%1.16b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s16 (int16_t * a, int16x8_t b)
-{
-  __asm__ ("st1 {%1.8h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s32 (int32_t * a, int32x4_t b)
-{
-  __asm__ ("st1 {%1.4s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s64 (int64_t * a, int64x2_t b)
-{
-  __asm__ ("st1 {%1.2d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u8 (uint8_t * a, uint8x16_t b)
-{
-  __asm__ ("st1 {%1.16b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u16 (uint16_t * a, uint16x8_t b)
-{
-  __asm__ ("st1 {%1.8h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u32 (uint32_t * a, uint32x4_t b)
-{
-  __asm__ ("st1 {%1.4s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u64 (uint64_t * a, uint64x2_t b)
-{
-  __asm__ ("st1 {%1.2d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c)
 {
@@ -20279,6 +19800,165 @@  vdupd_lane_u64 (uint64x2_t a, int const b)
   return (uint64x1_t) __builtin_aarch64_dup_lane_scalarv2di ((int64x2_t) a, b);
 }
 
+/* vld1 */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_f32 (const float32_t *a)
+{
+  return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vld1_f64 (const float64_t *a)
+{
+  return *a;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_p8 (const poly8_t *a)
+{
+  return (poly8x8_t)
+    __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_p16 (const poly16_t *a)
+{
+  return (poly16x4_t)
+    __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_s8 (const int8_t *a)
+{
+  return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_s16 (const int16_t *a)
+{
+  return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_s32 (const int32_t *a)
+{
+  return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_s64 (const int64_t *a)
+{
+  return *a;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_u8 (const uint8_t *a)
+{
+  return (uint8x8_t)
+    __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_u16 (const uint16_t *a)
+{
+  return (uint16x4_t)
+    __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_u32 (const uint32_t *a)
+{
+  return (uint32x2_t)
+    __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_u64 (const uint64_t *a)
+{
+  return *a;
+}
+
+/* vld1q */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_f32 (const float32_t *a)
+{
+  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vld1q_f64 (const float64_t *a)
+{
+  return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_p8 (const poly8_t *a)
+{
+  return (poly8x16_t)
+    __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_p16 (const poly16_t *a)
+{
+  return (poly16x8_t)
+    __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_s8 (const int8_t *a)
+{
+  return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_s16 (const int16_t *a)
+{
+  return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_s32 (const int32_t *a)
+{
+  return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_s64 (const int64_t *a)
+{
+  return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_u8 (const uint8_t *a)
+{
+  return (uint8x16_t)
+    __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_u16 (const uint16_t *a)
+{
+  return (uint16x8_t)
+    __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_u32 (const uint32_t *a)
+{
+  return (uint32x4_t)
+    __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_u64 (const uint64_t *a)
+{
+  return (uint64x2_t)
+    __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+}
+
 /* vldn */
 
 __extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
@@ -24542,6 +24222,165 @@  vsrid_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
   return (uint64x1_t) __builtin_aarch64_usri_ndi (__a, __b, __c);
 }
 
+/* vst1 */
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f32 (float32_t *a, float32x2_t b)
+{
+  __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f64 (float64_t *a, float64x1_t b)
+{
+  *a = b;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p8 (poly8_t *a, poly8x8_t b)
+{
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
+			     (int8x8_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p16 (poly16_t *a, poly16x4_t b)
+{
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x4_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s8 (int8_t *a, int8x8_t b)
+{
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s16 (int16_t *a, int16x4_t b)
+{
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s32 (int32_t *a, int32x2_t b)
+{
+  __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s64 (int64_t *a, int64x1_t b)
+{
+  *a = b;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u8 (uint8_t *a, uint8x8_t b)
+{
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
+			     (int8x8_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u16 (uint16_t *a, uint16x4_t b)
+{
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x4_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u32 (uint32_t *a, uint32x2_t b)
+{
+  __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a,
+			     (int32x2_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u64 (uint64_t *a, uint64x1_t b)
+{
+  *a = b;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f32 (float32_t *a, float32x4_t b)
+{
+  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f64 (float64_t *a, float64x2_t b)
+{
+  __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b);
+}
+
+/* vst1q */
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p8 (poly8_t *a, poly8x16_t b)
+{
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
+			      (int8x16_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p16 (poly16_t *a, poly16x8_t b)
+{
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x8_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s8 (int8_t *a, int8x16_t b)
+{
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s16 (int16_t *a, int16x8_t b)
+{
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s32 (int32_t *a, int32x4_t b)
+{
+  __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s64 (int64_t *a, int64x2_t b)
+{
+  __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u8 (uint8_t *a, uint8x16_t b)
+{
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
+			      (int8x16_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u16 (uint16_t *a, uint16x8_t b)
+{
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x8_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u32 (uint32_t *a, uint32x4_t b)
+{
+  __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a,
+			     (int32x4_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u64 (uint64_t *a, uint64x2_t b)
+{
+  __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a,
+			     (int64x2_t) b);
+}
+
 /* vstn */
 
 __extension__ static __inline void