Patchwork [AArch64] Fix vld1<q>_* asm constraints in arm_neon.h

login
register
mail settings
Submitter James Greenhalgh
Date April 24, 2013, 2:34 p.m.
Message ID <1366814077-27335-1-git-send-email-james.greenhalgh@arm.com>
Download mbox | patch
Permalink /patch/239216/
State New
Headers show

Comments

James Greenhalgh - April 24, 2013, 2:34 p.m.
Hi,

The vld1<q>_* patterns in arm_neon.h did not correctly describe
their register/memory constraints. This could lead to incorrect
code generation where they were used.

This patch fixes the bug by giving the patterns the correct
register constraints.

Regression tested on aarch64-none-elf without regressions.

OK?

Thanks,
James Greenhalgh

---
gcc/

2013-04-24  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/arm_neon.h (vld1<q>_lane*): Fix constraints.
	(vld1<q>_dup_<sufp><8, 16, 32, 64>): Likewise.
	(vld1<q>_<sufp><8, 16, 32, 64>): Likewise.
Marcus Shawcroft - April 24, 2013, 3:09 p.m.
On 24/04/13 15:34, James Greenhalgh wrote:
> Hi,
>
> The vld1<q>_* patterns in arm_neon.h did not correctly describe
> their register/memory constraints. This could lead to incorrect
> code generation where they were used.
>
> This patch fixes the bug by giving the patterns the correct
> register constraints.
>
> Regression tested on aarch64-none-elf without regressions.
>
> OK?
>
> Thanks,
> James Greenhalgh
>
> ---
> gcc/
>
> 2013-04-24  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* config/aarch64/arm_neon.h (vld1<q>_lane*): Fix constraints.
> 	(vld1<q>_dup_<sufp><8, 16, 32, 64>): Likewise.
> 	(vld1<q>_<sufp><8, 16, 32, 64>): Likewise.
>

OK
/Marcus
Marcus Shawcroft - April 24, 2013, 3:11 p.m.
On 24/04/13 16:09, Marcus Shawcroft wrote:
> On 24/04/13 15:34, James Greenhalgh wrote:
>> Hi,
>>
>> The vld1<q>_* patterns in arm_neon.h did not correctly describe
>> their register/memory constraints. This could lead to incorrect
>> code generation where they were used.
>>
>> This patch fixes the bug by giving the patterns the correct
>> register constraints.
>>
>> Regression tested on aarch64-none-elf without regressions.
>>
>> OK?
>>
>> Thanks,
>> James Greenhalgh
>>
>> ---
>> gcc/
>>
>> 2013-04-24  James Greenhalgh  <james.greenhalgh@arm.com>
>>
>> 	* config/aarch64/arm_neon.h (vld1<q>_lane*): Fix constraints.
>> 	(vld1<q>_dup_<sufp><8, 16, 32, 64>): Likewise.
>> 	(vld1<q>_<sufp><8, 16, 32, 64>): Likewise.
>>
>
> OK
> /Marcus
>
>

and backport to 4.8 and arm/4.7 please.
/Marcus

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 5e25c77..58343a7 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8390,10 +8390,10 @@  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_dup_f32 (const float32_t * a)
 {
   float32x2_t result;
-  __asm__ ("ld1r {%0.2s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.2s}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8401,10 +8401,10 @@  __extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
 vld1_dup_f64 (const float64_t * a)
 {
   float64x1_t result;
-  __asm__ ("ld1 {%0.1d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.1d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8412,10 +8412,10 @@  __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vld1_dup_p8 (const poly8_t * a)
 {
   poly8x8_t result;
-  __asm__ ("ld1r {%0.8b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.8b}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8423,10 +8423,10 @@  __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vld1_dup_p16 (const poly16_t * a)
 {
   poly16x4_t result;
-  __asm__ ("ld1r {%0.4h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.4h}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8434,10 +8434,10 @@  __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vld1_dup_s8 (const int8_t * a)
 {
   int8x8_t result;
-  __asm__ ("ld1r {%0.8b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.8b}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8445,10 +8445,10 @@  __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vld1_dup_s16 (const int16_t * a)
 {
   int16x4_t result;
-  __asm__ ("ld1r {%0.4h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.4h}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8456,10 +8456,10 @@  __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vld1_dup_s32 (const int32_t * a)
 {
   int32x2_t result;
-  __asm__ ("ld1r {%0.2s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.2s}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8467,10 +8467,10 @@  __extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
 vld1_dup_s64 (const int64_t * a)
 {
   int64x1_t result;
-  __asm__ ("ld1 {%0.1d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.1d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8478,10 +8478,10 @@  __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vld1_dup_u8 (const uint8_t * a)
 {
   uint8x8_t result;
-  __asm__ ("ld1r {%0.8b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.8b}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8489,10 +8489,10 @@  __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vld1_dup_u16 (const uint16_t * a)
 {
   uint16x4_t result;
-  __asm__ ("ld1r {%0.4h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.4h}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8500,10 +8500,10 @@  __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vld1_dup_u32 (const uint32_t * a)
 {
   uint32x2_t result;
-  __asm__ ("ld1r {%0.2s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.2s}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8511,10 +8511,10 @@  __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vld1_dup_u64 (const uint64_t * a)
 {
   uint64x1_t result;
-  __asm__ ("ld1 {%0.1d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.1d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8522,10 +8522,10 @@  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_f32 (const float32_t * a)
 {
   float32x2_t result;
-  __asm__ ("ld1 {%0.2s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.2s}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const float32x2_t *_a = (float32x2_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8533,10 +8533,10 @@  __extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
 vld1_f64 (const float64_t * a)
 {
   float64x1_t result;
-  __asm__ ("ld1 {%0.1d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.1d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8546,9 +8546,9 @@  vld1_f64 (const float64_t * a)
        float32x2_t b_ = (b);                                            \
        const float32_t * a_ = (a);                                      \
        float32x2_t result;                                              \
-       __asm__ ("ld1 {%0.s}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8559,9 +8559,9 @@  vld1_f64 (const float64_t * a)
        float64x1_t b_ = (b);                                            \
        const float64_t * a_ = (a);                                      \
        float64x1_t result;                                              \
-       __asm__ ("ld1 {%0.d}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8572,9 +8572,9 @@  vld1_f64 (const float64_t * a)
        poly8x8_t b_ = (b);                                              \
        const poly8_t * a_ = (a);                                        \
        poly8x8_t result;                                                \
-       __asm__ ("ld1 {%0.b}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8585,9 +8585,9 @@  vld1_f64 (const float64_t * a)
        poly16x4_t b_ = (b);                                             \
        const poly16_t * a_ = (a);                                       \
        poly16x4_t result;                                               \
-       __asm__ ("ld1 {%0.h}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8598,9 +8598,9 @@  vld1_f64 (const float64_t * a)
        int8x8_t b_ = (b);                                               \
        const int8_t * a_ = (a);                                         \
        int8x8_t result;                                                 \
-       __asm__ ("ld1 {%0.b}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8611,9 +8611,9 @@  vld1_f64 (const float64_t * a)
        int16x4_t b_ = (b);                                              \
        const int16_t * a_ = (a);                                        \
        int16x4_t result;                                                \
-       __asm__ ("ld1 {%0.h}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8624,9 +8624,9 @@  vld1_f64 (const float64_t * a)
        int32x2_t b_ = (b);                                              \
        const int32_t * a_ = (a);                                        \
        int32x2_t result;                                                \
-       __asm__ ("ld1 {%0.s}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8637,9 +8637,9 @@  vld1_f64 (const float64_t * a)
        int64x1_t b_ = (b);                                              \
        const int64_t * a_ = (a);                                        \
        int64x1_t result;                                                \
-       __asm__ ("ld1 {%0.d}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8650,9 +8650,9 @@  vld1_f64 (const float64_t * a)
        uint8x8_t b_ = (b);                                              \
        const uint8_t * a_ = (a);                                        \
        uint8x8_t result;                                                \
-       __asm__ ("ld1 {%0.b}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8663,9 +8663,9 @@  vld1_f64 (const float64_t * a)
        uint16x4_t b_ = (b);                                             \
        const uint16_t * a_ = (a);                                       \
        uint16x4_t result;                                               \
-       __asm__ ("ld1 {%0.h}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8676,9 +8676,9 @@  vld1_f64 (const float64_t * a)
        uint32x2_t b_ = (b);                                             \
        const uint32_t * a_ = (a);                                       \
        uint32x2_t result;                                               \
-       __asm__ ("ld1 {%0.s}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8689,9 +8689,9 @@  vld1_f64 (const float64_t * a)
        uint64x1_t b_ = (b);                                             \
        const uint64_t * a_ = (a);                                       \
        uint64x1_t result;                                               \
-       __asm__ ("ld1 {%0.d}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i" (c), "Utv"(*a_), "0"(b_)                          \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8700,10 +8700,10 @@  __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vld1_p8 (const poly8_t * a)
 {
   poly8x8_t result;
-  __asm__ ("ld1 {%0.8b}, [%1]"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
+  __asm__ ("ld1 {%0.8b}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const poly8x8_t *_a = (poly8x8_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8711,10 +8711,10 @@  __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vld1_p16 (const poly16_t * a)
 {
   poly16x4_t result;
-  __asm__ ("ld1 {%0.4h}, [%1]"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
+  __asm__ ("ld1 {%0.4h}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const poly16x4_t *_a = (poly16x4_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8722,10 +8722,10 @@  __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vld1_s8 (const int8_t * a)
 {
   int8x8_t result;
-  __asm__ ("ld1 {%0.8b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.8b}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const int8x8_t *_a = (int8x8_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8733,10 +8733,10 @@  __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vld1_s16 (const int16_t * a)
 {
   int16x4_t result;
-  __asm__ ("ld1 {%0.4h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.4h}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const int16x4_t *_a = (int16x4_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8744,10 +8744,10 @@  __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vld1_s32 (const int32_t * a)
 {
   int32x2_t result;
-  __asm__ ("ld1 {%0.2s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.2s}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const int32x2_t *_a = (int32x2_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8755,10 +8755,10 @@  __extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
 vld1_s64 (const int64_t * a)
 {
   int64x1_t result;
-  __asm__ ("ld1 {%0.1d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.1d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8766,10 +8766,10 @@  __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vld1_u8 (const uint8_t * a)
 {
   uint8x8_t result;
-  __asm__ ("ld1 {%0.8b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.8b}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const uint8x8_t *_a = (uint8x8_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8777,10 +8777,10 @@  __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vld1_u16 (const uint16_t * a)
 {
   uint16x4_t result;
-  __asm__ ("ld1 {%0.4h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.4h}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const uint16x4_t *_a = (uint16x4_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8788,10 +8788,10 @@  __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vld1_u32 (const uint32_t * a)
 {
   uint32x2_t result;
-  __asm__ ("ld1 {%0.2s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.2s}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const uint32x2_t *_a = (uint32x2_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8799,10 +8799,10 @@  __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vld1_u64 (const uint64_t * a)
 {
   uint64x1_t result;
-  __asm__ ("ld1 {%0.1d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.1d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8810,10 +8810,10 @@  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t * a)
 {
   float32x4_t result;
-  __asm__ ("ld1r {%0.4s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.4s}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8821,10 +8821,10 @@  __extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
 vld1q_dup_f64 (const float64_t * a)
 {
   float64x2_t result;
-  __asm__ ("ld1r {%0.2d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.2d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8832,10 +8832,10 @@  __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vld1q_dup_p8 (const poly8_t * a)
 {
   poly8x16_t result;
-  __asm__ ("ld1r {%0.16b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.16b}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8843,10 +8843,10 @@  __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vld1q_dup_p16 (const poly16_t * a)
 {
   poly16x8_t result;
-  __asm__ ("ld1r {%0.8h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.8h}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8854,10 +8854,10 @@  __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vld1q_dup_s8 (const int8_t * a)
 {
   int8x16_t result;
-  __asm__ ("ld1r {%0.16b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.16b}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8865,10 +8865,10 @@  __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vld1q_dup_s16 (const int16_t * a)
 {
   int16x8_t result;
-  __asm__ ("ld1r {%0.8h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.8h}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8876,10 +8876,10 @@  __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_s32 (const int32_t * a)
 {
   int32x4_t result;
-  __asm__ ("ld1r {%0.4s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.4s}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8887,10 +8887,10 @@  __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vld1q_dup_s64 (const int64_t * a)
 {
   int64x2_t result;
-  __asm__ ("ld1r {%0.2d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.2d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8898,10 +8898,10 @@  __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vld1q_dup_u8 (const uint8_t * a)
 {
   uint8x16_t result;
-  __asm__ ("ld1r {%0.16b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.16b}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8909,10 +8909,10 @@  __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vld1q_dup_u16 (const uint16_t * a)
 {
   uint16x8_t result;
-  __asm__ ("ld1r {%0.8h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.8h}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8920,10 +8920,10 @@  __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_u32 (const uint32_t * a)
 {
   uint32x4_t result;
-  __asm__ ("ld1r {%0.4s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.4s}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8931,10 +8931,10 @@  __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vld1q_dup_u64 (const uint64_t * a)
 {
   uint64x2_t result;
-  __asm__ ("ld1r {%0.2d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1r {%0.2d}, %1"
+	   : "=w"(result)
+	   : "Utv"(*a)
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8942,10 +8942,10 @@  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_f32 (const float32_t * a)
 {
   float32x4_t result;
-  __asm__ ("ld1 {%0.4s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.4s}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const float32x4_t *_a = (float32x4_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8953,10 +8953,10 @@  __extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
 vld1q_f64 (const float64_t * a)
 {
   float64x2_t result;
-  __asm__ ("ld1 {%0.2d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.2d}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const float64x2_t *_a = (float64x2_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -8966,9 +8966,9 @@  vld1q_f64 (const float64_t * a)
        float32x4_t b_ = (b);                                            \
        const float32_t * a_ = (a);                                      \
        float32x4_t result;                                              \
-       __asm__ ("ld1 {%0.s}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8979,9 +8979,9 @@  vld1q_f64 (const float64_t * a)
        float64x2_t b_ = (b);                                            \
        const float64_t * a_ = (a);                                      \
        float64x2_t result;                                              \
-       __asm__ ("ld1 {%0.d}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8992,9 +8992,9 @@  vld1q_f64 (const float64_t * a)
        poly8x16_t b_ = (b);                                             \
        const poly8_t * a_ = (a);                                        \
        poly8x16_t result;                                               \
-       __asm__ ("ld1 {%0.b}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9005,9 +9005,9 @@  vld1q_f64 (const float64_t * a)
        poly16x8_t b_ = (b);                                             \
        const poly16_t * a_ = (a);                                       \
        poly16x8_t result;                                               \
-       __asm__ ("ld1 {%0.h}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9018,9 +9018,9 @@  vld1q_f64 (const float64_t * a)
        int8x16_t b_ = (b);                                              \
        const int8_t * a_ = (a);                                         \
        int8x16_t result;                                                \
-       __asm__ ("ld1 {%0.b}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9031,9 +9031,9 @@  vld1q_f64 (const float64_t * a)
        int16x8_t b_ = (b);                                              \
        const int16_t * a_ = (a);                                        \
        int16x8_t result;                                                \
-       __asm__ ("ld1 {%0.h}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9044,9 +9044,9 @@  vld1q_f64 (const float64_t * a)
        int32x4_t b_ = (b);                                              \
        const int32_t * a_ = (a);                                        \
        int32x4_t result;                                                \
-       __asm__ ("ld1 {%0.s}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9057,9 +9057,9 @@  vld1q_f64 (const float64_t * a)
        int64x2_t b_ = (b);                                              \
        const int64_t * a_ = (a);                                        \
        int64x2_t result;                                                \
-       __asm__ ("ld1 {%0.d}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9070,9 +9070,9 @@  vld1q_f64 (const float64_t * a)
        uint8x16_t b_ = (b);                                             \
        const uint8_t * a_ = (a);                                        \
        uint8x16_t result;                                               \
-       __asm__ ("ld1 {%0.b}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.b}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9083,9 +9083,9 @@  vld1q_f64 (const float64_t * a)
        uint16x8_t b_ = (b);                                             \
        const uint16_t * a_ = (a);                                       \
        uint16x8_t result;                                               \
-       __asm__ ("ld1 {%0.h}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.h}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9096,9 +9096,9 @@  vld1q_f64 (const float64_t * a)
        uint32x4_t b_ = (b);                                             \
        const uint32_t * a_ = (a);                                       \
        uint32x4_t result;                                               \
-       __asm__ ("ld1 {%0.s}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.s}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9109,9 +9109,9 @@  vld1q_f64 (const float64_t * a)
        uint64x2_t b_ = (b);                                             \
        const uint64_t * a_ = (a);                                       \
        uint64x2_t result;                                               \
-       __asm__ ("ld1 {%0.d}[%3],[%1]"                                   \
+       __asm__ ("ld1 {%0.d}[%1], %2"                                    \
                 : "=w"(result)                                          \
-                : "r"(a_), "0"(b_), "i"(c)                              \
+                : "i"(c), "Utv"(*a_), "0"(b_)                           \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9120,10 +9120,10 @@  __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vld1q_p8 (const poly8_t * a)
 {
   poly8x16_t result;
-  __asm__ ("ld1 {%0.16b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.16b}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const poly8x16_t *_a = (poly8x16_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9131,10 +9131,10 @@  __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vld1q_p16 (const poly16_t * a)
 {
   poly16x8_t result;
-  __asm__ ("ld1 {%0.8h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.16b}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const poly16x8_t *_a = (poly16x8_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9142,10 +9142,10 @@  __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vld1q_s8 (const int8_t * a)
 {
   int8x16_t result;
-  __asm__ ("ld1 {%0.16b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.16b}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const int8x16_t *_a = (int8x16_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9153,10 +9153,10 @@  __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vld1q_s16 (const int16_t * a)
 {
   int16x8_t result;
-  __asm__ ("ld1 {%0.8h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.8h}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const int16x8_t *_a = (int16x8_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9164,10 +9164,10 @@  __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vld1q_s32 (const int32_t * a)
 {
   int32x4_t result;
-  __asm__ ("ld1 {%0.4s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.4s}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const int32x4_t *_a = (int32x4_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9175,10 +9175,10 @@  __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vld1q_s64 (const int64_t * a)
 {
   int64x2_t result;
-  __asm__ ("ld1 {%0.2d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.2d}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const int64x2_t *_a = (int64x2_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9186,10 +9186,10 @@  __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vld1q_u8 (const uint8_t * a)
 {
   uint8x16_t result;
-  __asm__ ("ld1 {%0.16b},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.16b}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const uint8x16_t *_a = (uint8x16_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9197,10 +9197,10 @@  __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vld1q_u16 (const uint16_t * a)
 {
   uint16x8_t result;
-  __asm__ ("ld1 {%0.8h},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.8h}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const uint16x8_t *_a = (uint16x8_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9208,10 +9208,10 @@  __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vld1q_u32 (const uint32_t * a)
 {
   uint32x4_t result;
-  __asm__ ("ld1 {%0.4s},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.4s}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const uint32x4_t *_a = (uint32x4_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }
 
@@ -9219,10 +9219,10 @@  __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vld1q_u64 (const uint64_t * a)
 {
   uint64x2_t result;
-  __asm__ ("ld1 {%0.2d},[%1]"
-           : "=w"(result)
-           : "r"(a)
-           : "memory");
+  __asm__ ("ld1 {%0.2d}, %1"
+	   : "=w"(result)
+	   : "Utv"(({const uint64x2_t *_a = (uint64x2_t *) a; *_a;}))
+	   : /* No clobbers */);
   return result;
 }