Patchwork [AArch64] Convert ld1, st1 arm_neon.h intrinsics to RTL builtins.

login
register
mail settings
Submitter James Greenhalgh
Date July 4, 2013, 8:17 a.m.
Message ID <1372925877-30824-1-git-send-email-james.greenhalgh@arm.com>
Download mbox | patch
Permalink /patch/256818/
State New
Headers show

Comments

James Greenhalgh - July 4, 2013, 8:17 a.m.
> > 2013-07-02  James Greenhalgh  <james.greenhalgh@arm.com>
> >
> > 	* config/aarch64/aarch64-builtins.c
> > 	(aarch64_simd_expand_builtin): Handle AARCH64_SIMD_STORE1.
> > 	* config/aarch64/aarch64-simd-builtins.def (ld1): New.
> > 	(st1): Likewise.
> > 	* config/aarch64/aarch64-simd.md
> > 	(aarch64_ld1<VALL:mode>): New.
> > 	(aarch64_st1<VALL:mode>): Likewise.
> > 	* config/aarch64/arm_neon.h
> > 	(vld1<q>_<fpsu><8, 16, 32, 64>): Convert to RTL builtins.
> >
>
> OK
> /Marcus

Thanks Marcus,

I've committed this as r200634.

As this is a bug-fix I'd like to backport it to the 4.8 branch.
I've attached a copy of the patch that applies to 4.8
and run it through testing on aarch64-none-elf with no issues.

Is this OK to commit to gcc-4_8-branch?

Thanks,
James

---
gcc/

2013-07-04  James Greenhalgh  <james.greenhalgh@arm.com>

	Backport From mainline:
	2013-07-03  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-builtins.c
	(aarch64_simd_expand_builtin): Handle AARCH64_SIMD_STORE1.
	* config/aarch64/aarch64-simd-builtins.def (ld1): New.
	(st1): Likewise.
	* config/aarch64/aarch64-simd.md
	(aarch64_ld1<VALL:mode>): New.
	(aarch64_st1<VALL:mode>): Likewise.
	* config/aarch64/arm_neon.h
	(vld1<q>_<fpsu><8, 16, 32, 64>): Convert to RTL builtins.
James Greenhalgh - July 16, 2013, 1:45 p.m.
Ping?

> -----Original Message-----
> From: James Greenhalgh
> Sent: 04 July 2013 09:18
> To: gcc-patches@gcc.gnu.org
> Cc: Marcus Shawcroft
> Subject: RE: [AArch64] Convert ld1, st1 arm_neon.h intrinsics to RTL
> builtins.
> 
> 
> > > 2013-07-02  James Greenhalgh  <james.greenhalgh@arm.com>
> > >
> > > 	* config/aarch64/aarch64-builtins.c
> > > 	(aarch64_simd_expand_builtin): Handle AARCH64_SIMD_STORE1.
> > > 	* config/aarch64/aarch64-simd-builtins.def (ld1): New.
> > > 	(st1): Likewise.
> > > 	* config/aarch64/aarch64-simd.md
> > > 	(aarch64_ld1<VALL:mode>): New.
> > > 	(aarch64_st1<VALL:mode>): Likewise.
> > > 	* config/aarch64/arm_neon.h
> > > 	(vld1<q>_<fpsu><8, 16, 32, 64>): Convert to RTL builtins.
> > >
> >
> > OK
> > /Marcus
> 
> Thanks Marcus,
> 
> I've committed this as r200634.
> 
> As this is a bug-fix I'd like to backport it to the 4.8 branch.
> I've attached a copy of the patch that applies to 4.8
> and run it through testing on aarch64-none-elf with no issues.
> 
> Is this OK to commit to gcc-4_8-branch?
> 
> Thanks,
> James
> 
> ---
> gcc/
> 
> 2013-07-04  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	Backport From mainline:
> 	2013-07-03  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* config/aarch64/aarch64-builtins.c
> 	(aarch64_simd_expand_builtin): Handle AARCH64_SIMD_STORE1.
> 	* config/aarch64/aarch64-simd-builtins.def (ld1): New.
> 	(st1): Likewise.
> 	* config/aarch64/aarch64-simd.md
> 	(aarch64_ld1<VALL:mode>): New.
> 	(aarch64_st1<VALL:mode>): Likewise.
> 	* config/aarch64/arm_neon.h
> 	(vld1<q>_<fpsu><8, 16, 32, 64>): Convert to RTL builtins.
Marcus Shawcroft - July 16, 2013, 2:06 p.m.
On 04/07/13 09:17, James Greenhalgh wrote:

> I've committed this as r200634.
>
> As this is a bug-fix I'd like to backport it to the 4.8 branch.
> I've attached a copy of the patch that applies to 4.8
> and run it through testing on aarch64-none-elf with no issues.
>
> Is this OK to commit to gcc-4_8-branch?

OK
/Marcus

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 1ea55a8..b2901db 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1154,6 +1154,7 @@  aarch64_simd_expand_builtin (int fcode, tree exp, rtx target)
       return aarch64_simd_expand_args (target, icode, 1, exp,
 				       SIMD_ARG_COPY_TO_REG, SIMD_ARG_STOP);
 
+    case AARCH64_SIMD_STORE1:
     case AARCH64_SIMD_STORESTRUCT:
       return aarch64_simd_expand_args (target, icode, 0, exp,
 				       SIMD_ARG_COPY_TO_REG,
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index a6a5e12..955da26 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -256,3 +256,10 @@ 
   BUILTIN_VALL (BINOP, uzp2)
   BUILTIN_VALL (BINOP, trn1)
   BUILTIN_VALL (BINOP, trn2)
+
+  /* Implemented by aarch64_ld1<VALL:mode>.  */
+  BUILTIN_VALL (LOAD1, ld1)
+
+  /* Implemented by aarch64_st1<VALL:mode>.  */
+  BUILTIN_VALL (STORE1, st1)
+
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 79c3093..00f3c31 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3457,6 +3457,17 @@ 
   DONE;
 })
 
+(define_expand "aarch64_ld1<VALL:mode>"
+ [(match_operand:VALL 0 "register_operand")
+  (match_operand:DI 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <VALL:MODE>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+  emit_move_insn (operands[0], mem);
+  DONE;
+})
+
 (define_expand "aarch64_ld<VSTRUCT:nregs><VQ:mode>"
  [(match_operand:VSTRUCT 0 "register_operand" "=w")
   (match_operand:DI 1 "register_operand" "r")
@@ -3673,6 +3684,17 @@ 
   DONE;
 })
 
+(define_expand "aarch64_st1<VALL:mode>"
+ [(match_operand:DI 0 "register_operand")
+  (match_operand:VALL 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <VALL:MODE>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[0]);
+  emit_move_insn (mem, operands[1]);
+  DONE;
+})
+
 ;; Expander for builtins to insert vector registers into large
 ;; opaque integer modes.
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 669217e..60e1f7d 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8518,28 +8518,6 @@  vld1_dup_u64 (const uint64_t * a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vld1_f32 (const float32_t * a)
-{
-  float32x2_t result;
-  __asm__ ("ld1 {%0.2s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const float32x2_t *_a = (float32x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vld1_f64 (const float64_t * a)
-{
-  float64x1_t result;
-  __asm__ ("ld1 {%0.1d}, %1"
-	   : "=w"(result)
-	   : "Utv"(*a)
-	   : /* No clobbers */);
-  return result;
-}
-
 #define vld1_lane_f32(a, b, c)                                          \
   __extension__                                                         \
     ({                                                                  \
@@ -8696,116 +8674,6 @@  vld1_f64 (const float64_t * a)
        result;                                                          \
      })
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vld1_p8 (const poly8_t * a)
-{
-  poly8x8_t result;
-  __asm__ ("ld1 {%0.8b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const poly8x8_t *_a = (poly8x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vld1_p16 (const poly16_t * a)
-{
-  poly16x4_t result;
-  __asm__ ("ld1 {%0.4h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const poly16x4_t *_a = (poly16x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vld1_s8 (const int8_t * a)
-{
-  int8x8_t result;
-  __asm__ ("ld1 {%0.8b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int8x8_t *_a = (int8x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vld1_s16 (const int16_t * a)
-{
-  int16x4_t result;
-  __asm__ ("ld1 {%0.4h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int16x4_t *_a = (int16x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vld1_s32 (const int32_t * a)
-{
-  int32x2_t result;
-  __asm__ ("ld1 {%0.2s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int32x2_t *_a = (int32x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vld1_s64 (const int64_t * a)
-{
-  int64x1_t result;
-  __asm__ ("ld1 {%0.1d}, %1"
-	   : "=w"(result)
-	   : "Utv"(*a)
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vld1_u8 (const uint8_t * a)
-{
-  uint8x8_t result;
-  __asm__ ("ld1 {%0.8b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint8x8_t *_a = (uint8x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vld1_u16 (const uint16_t * a)
-{
-  uint16x4_t result;
-  __asm__ ("ld1 {%0.4h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint16x4_t *_a = (uint16x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vld1_u32 (const uint32_t * a)
-{
-  uint32x2_t result;
-  __asm__ ("ld1 {%0.2s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint32x2_t *_a = (uint32x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vld1_u64 (const uint64_t * a)
-{
-  uint64x1_t result;
-  __asm__ ("ld1 {%0.1d}, %1"
-	   : "=w"(result)
-	   : "Utv"(*a)
-	   : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t * a)
 {
@@ -8938,28 +8806,6 @@  vld1q_dup_u64 (const uint64_t * a)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vld1q_f32 (const float32_t * a)
-{
-  float32x4_t result;
-  __asm__ ("ld1 {%0.4s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const float32x4_t *_a = (float32x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vld1q_f64 (const float64_t * a)
-{
-  float64x2_t result;
-  __asm__ ("ld1 {%0.2d}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const float64x2_t *_a = (float64x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
 #define vld1q_lane_f32(a, b, c)                                         \
   __extension__                                                         \
     ({                                                                  \
@@ -9116,116 +8962,6 @@  vld1q_f64 (const float64_t * a)
        result;                                                          \
      })
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vld1q_p8 (const poly8_t * a)
-{
-  poly8x16_t result;
-  __asm__ ("ld1 {%0.16b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const poly8x16_t *_a = (poly8x16_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vld1q_p16 (const poly16_t * a)
-{
-  poly16x8_t result;
-  __asm__ ("ld1 {%0.16b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const poly16x8_t *_a = (poly16x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vld1q_s8 (const int8_t * a)
-{
-  int8x16_t result;
-  __asm__ ("ld1 {%0.16b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int8x16_t *_a = (int8x16_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vld1q_s16 (const int16_t * a)
-{
-  int16x8_t result;
-  __asm__ ("ld1 {%0.8h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int16x8_t *_a = (int16x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vld1q_s32 (const int32_t * a)
-{
-  int32x4_t result;
-  __asm__ ("ld1 {%0.4s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int32x4_t *_a = (int32x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vld1q_s64 (const int64_t * a)
-{
-  int64x2_t result;
-  __asm__ ("ld1 {%0.2d}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const int64x2_t *_a = (int64x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vld1q_u8 (const uint8_t * a)
-{
-  uint8x16_t result;
-  __asm__ ("ld1 {%0.16b}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint8x16_t *_a = (uint8x16_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vld1q_u16 (const uint16_t * a)
-{
-  uint16x8_t result;
-  __asm__ ("ld1 {%0.8h}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint16x8_t *_a = (uint16x8_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vld1q_u32 (const uint32_t * a)
-{
-  uint32x4_t result;
-  __asm__ ("ld1 {%0.4s}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint32x4_t *_a = (uint32x4_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vld1q_u64 (const uint64_t * a)
-{
-  uint64x2_t result;
-  __asm__ ("ld1 {%0.2d}, %1"
-	   : "=w"(result)
-	   : "Utv"(({const uint64x2_t *_a = (uint64x2_t *) a; *_a;}))
-	   : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vmaxnm_f32 (float32x2_t a, float32x2_t b)
 {
@@ -16285,24 +16021,6 @@  vrsubhn_u64 (uint64x2_t a, uint64x2_t b)
        result;                                                          \
      })
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_f32 (float32_t * a, float32x2_t b)
-{
-  __asm__ ("st1 {%1.2s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_f64 (float64_t * a, float64x1_t b)
-{
-  __asm__ ("st1 {%1.1d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
 #define vst1_lane_f32(a, b, c)                                          \
   __extension__                                                         \
     ({                                                                  \
@@ -16435,113 +16153,6 @@  vst1_f64 (float64_t * a, float64x1_t b)
                 : "memory");                                            \
      })
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_p8 (poly8_t * a, poly8x8_t b)
-{
-  __asm__ ("st1 {%1.8b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_p16 (poly16_t * a, poly16x4_t b)
-{
-  __asm__ ("st1 {%1.4h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s8 (int8_t * a, int8x8_t b)
-{
-  __asm__ ("st1 {%1.8b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s16 (int16_t * a, int16x4_t b)
-{
-  __asm__ ("st1 {%1.4h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s32 (int32_t * a, int32x2_t b)
-{
-  __asm__ ("st1 {%1.2s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s64 (int64_t * a, int64x1_t b)
-{
-  __asm__ ("st1 {%1.1d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u8 (uint8_t * a, uint8x8_t b)
-{
-  __asm__ ("st1 {%1.8b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u16 (uint16_t * a, uint16x4_t b)
-{
-  __asm__ ("st1 {%1.4h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u32 (uint32_t * a, uint32x2_t b)
-{
-  __asm__ ("st1 {%1.2s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u64 (uint64_t * a, uint64x1_t b)
-{
-  __asm__ ("st1 {%1.1d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_f32 (float32_t * a, float32x4_t b)
-{
-  __asm__ ("st1 {%1.4s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_f64 (float64_t * a, float64x2_t b)
-{
-  __asm__ ("st1 {%1.2d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
 
 #define vst1q_lane_f32(a, b, c)                                         \
   __extension__                                                         \
@@ -16675,96 +16286,6 @@  vst1q_f64 (float64_t * a, float64x2_t b)
                 : "memory");                                            \
      })
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_p8 (poly8_t * a, poly8x16_t b)
-{
-  __asm__ ("st1 {%1.16b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_p16 (poly16_t * a, poly16x8_t b)
-{
-  __asm__ ("st1 {%1.8h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s8 (int8_t * a, int8x16_t b)
-{
-  __asm__ ("st1 {%1.16b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s16 (int16_t * a, int16x8_t b)
-{
-  __asm__ ("st1 {%1.8h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s32 (int32_t * a, int32x4_t b)
-{
-  __asm__ ("st1 {%1.4s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s64 (int64_t * a, int64x2_t b)
-{
-  __asm__ ("st1 {%1.2d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u8 (uint8_t * a, uint8x16_t b)
-{
-  __asm__ ("st1 {%1.16b},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u16 (uint16_t * a, uint16x8_t b)
-{
-  __asm__ ("st1 {%1.8h},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u32 (uint32_t * a, uint32x4_t b)
-{
-  __asm__ ("st1 {%1.4s},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u64 (uint64_t * a, uint64x2_t b)
-{
-  __asm__ ("st1 {%1.2d},[%0]"
-           :
-           : "r"(a), "w"(b)
-           : "memory");
-}
-
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c)
 {
@@ -20537,6 +20058,165 @@  vdupd_lane_u64 (uint64x2_t a, int const b)
   return (uint64x1_t) __builtin_aarch64_dup_lanedi ((int64x2_t) a, b);
 }
 
+/* vld1 */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_f32 (const float32_t *a)
+{
+  return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vld1_f64 (const float64_t *a)
+{
+  return *a;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_p8 (const poly8_t *a)
+{
+  return (poly8x8_t)
+    __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_p16 (const poly16_t *a)
+{
+  return (poly16x4_t)
+    __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_s8 (const int8_t *a)
+{
+  return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_s16 (const int16_t *a)
+{
+  return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_s32 (const int32_t *a)
+{
+  return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_s64 (const int64_t *a)
+{
+  return *a;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_u8 (const uint8_t *a)
+{
+  return (uint8x8_t)
+    __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_u16 (const uint16_t *a)
+{
+  return (uint16x4_t)
+    __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_u32 (const uint32_t *a)
+{
+  return (uint32x2_t)
+    __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_u64 (const uint64_t *a)
+{
+  return *a;
+}
+
+/* vld1q */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_f32 (const float32_t *a)
+{
+  return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vld1q_f64 (const float64_t *a)
+{
+  return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_p8 (const poly8_t *a)
+{
+  return (poly8x16_t)
+    __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_p16 (const poly16_t *a)
+{
+  return (poly16x8_t)
+    __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_s8 (const int8_t *a)
+{
+  return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_s16 (const int16_t *a)
+{
+  return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_s32 (const int32_t *a)
+{
+  return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_s64 (const int64_t *a)
+{
+  return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_u8 (const uint8_t *a)
+{
+  return (uint8x16_t)
+    __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_u16 (const uint16_t *a)
+{
+  return (uint16x8_t)
+    __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_u32 (const uint32_t *a)
+{
+  return (uint32x4_t)
+    __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_u64 (const uint64_t *a)
+{
+  return (uint64x2_t)
+    __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+}
+
 /* vldn */
 
 __extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
@@ -24307,6 +23987,165 @@  vsrid_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
   return (uint64x1_t) __builtin_aarch64_usri_ndi (__a, __b, __c);
 }
 
+/* vst1 */
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f32 (float32_t *a, float32x2_t b)
+{
+  __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f64 (float64_t *a, float64x1_t b)
+{
+  *a = b;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p8 (poly8_t *a, poly8x8_t b)
+{
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
+			     (int8x8_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p16 (poly16_t *a, poly16x4_t b)
+{
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x4_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s8 (int8_t *a, int8x8_t b)
+{
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s16 (int16_t *a, int16x4_t b)
+{
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s32 (int32_t *a, int32x2_t b)
+{
+  __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s64 (int64_t *a, int64x1_t b)
+{
+  *a = b;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u8 (uint8_t *a, uint8x8_t b)
+{
+  __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
+			     (int8x8_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u16 (uint16_t *a, uint16x4_t b)
+{
+  __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x4_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u32 (uint32_t *a, uint32x2_t b)
+{
+  __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a,
+			     (int32x2_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u64 (uint64_t *a, uint64x1_t b)
+{
+  *a = b;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f32 (float32_t *a, float32x4_t b)
+{
+  __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f64 (float64_t *a, float64x2_t b)
+{
+  __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b);
+}
+
+/* vst1q */
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p8 (poly8_t *a, poly8x16_t b)
+{
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
+			      (int8x16_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p16 (poly16_t *a, poly16x8_t b)
+{
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x8_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s8 (int8_t *a, int8x16_t b)
+{
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s16 (int16_t *a, int16x8_t b)
+{
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s32 (int32_t *a, int32x4_t b)
+{
+  __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s64 (int64_t *a, int64x2_t b)
+{
+  __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u8 (uint8_t *a, uint8x16_t b)
+{
+  __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
+			      (int8x16_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u16 (uint16_t *a, uint16x8_t b)
+{
+  __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
+			     (int16x8_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u32 (uint32_t *a, uint32x4_t b)
+{
+  __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a,
+			     (int32x4_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u64 (uint64_t *a, uint64x2_t b)
+{
+  __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a,
+			     (int64x2_t) b);
+}
+
 /* vstn */
 
 __extension__ static __inline void