Patchwork [AArch64] Fix register constraints for lane intrinsics.

login
register
mail settings
Submitter James Greenhalgh
Date Sept. 6, 2013, 8:18 a.m.
Message ID <1378455486-23723-1-git-send-email-james.greenhalgh@arm.com>
Download mbox | patch
Permalink /patch/273119/
State New
Headers show

Comments

James Greenhalgh - Sept. 6, 2013, 8:18 a.m.
Hi,

Most of the vector-by-element instructions in AArch64 have the restriction
that, if the vector they are taking an element from has type "h"
then it must be in a register from the lower half of the vector register
set (i.e. v0-v15). While we have imposed that restriction in places, we
have not been consistent.

Fix that.

Tested with aarch64.exp with no regressions.

OK for trunk?

Thanks,
James

---
gcc/

2013-09-06  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-simd.md
	(aarch64_sqdml<SBINQOPS:as>l_n<mode>_internal): Use
	<vwx> iterator to ensure correct register choice.
	(aarch64_sqdml<SBINQOPS:as>l2_n<mode>_internal): Likewise.
	(aarch64_sqdmull_n<mode>): Likewise.
	(aarch64_sqdmull2_n<mode>_internal): Likewise.
	* config/aarch64/arm_neon.h
	(vml<as><q>_lane<q>_<su>16): Use 'x' constraint for element vector.
	(vml<as><q>_n_<su>16): Likewise.
	(vml<as>l_high_lane<q>_<su>16): Likewise.
	(vml<as>l_high_n_<su>16): Likewise.
	(vml<as>l_lane<q>_<su>16): Likewise.
	(vml<as>l_n_<su>16): Likewise.
	(vmul<q>_lane<q>_<su>16): Likewise.
	(vmul<q>_n_<su>16): Likewise.
	(vmull_lane<q>_<su>16): Likewise.
	(vmull_n_<su>16): Likewise.
	(vmull_high_lane<q>_<su>16): Likewise.
	(vmull_high_n_<su>16): Likewise.
	(vqrdmulh<q>_n_s16): Likewise.
Marcus Shawcroft - Sept. 6, 2013, 10:47 a.m.
On 6 September 2013 09:18, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>
> Hi,
>
> Most of the vector-by-element instructions in AArch64 have the restriction
> that, if the vector they are taking an element from has type "h"
> then it must be in a register from the lower half of the vector register
> set (i.e. v0-v15). While we have imposed that restriction in places, we
> have not been consistent.
>
> Fix that.
>
> Tested with aarch64.exp with no regressions.
>
> OK for trunk?

OK
/Marcus

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c085fb9c49958c5f402a28c0b39fe45ec1aadbc7..5161e48dfcea91910b9dc2ee68219c3ede55f4aa 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2797,7 +2797,7 @@  (define_insn "aarch64_sqdml<SBINQOPS:as>
 		      (match_operand:VD_HSI 2 "register_operand" "w"))
 		(sign_extend:<VWIDE>
 		  (vec_duplicate:VD_HSI
-		    (match_operand:<VEL> 3 "register_operand" "w"))))
+		    (match_operand:<VEL> 3 "register_operand" "<vwx>"))))
 	      (const_int 1))))]
   "TARGET_SIMD"
   "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[0]"
@@ -2955,7 +2955,7 @@  (define_insn "aarch64_sqdml<SBINQOPS:as>
                   (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
 	      (sign_extend:<VWIDE>
                 (vec_duplicate:<VHALF>
-		  (match_operand:<VEL> 3 "register_operand" "w"))))
+		  (match_operand:<VEL> 3 "register_operand" "<vwx>"))))
 	    (const_int 1))))]
   "TARGET_SIMD"
   "sqdml<SBINQOPS:as>l2\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[0]"
@@ -3083,7 +3083,7 @@  (define_insn "aarch64_sqdmull_n<mode>"
 		 (match_operand:VD_HSI 1 "register_operand" "w"))
 	       (sign_extend:<VWIDE>
                  (vec_duplicate:VD_HSI
-                   (match_operand:<VEL> 2 "register_operand" "w")))
+                   (match_operand:<VEL> 2 "register_operand" "<vwx>")))
 	       )
 	     (const_int 1)))]
   "TARGET_SIMD"
@@ -3193,7 +3193,7 @@  (define_insn "aarch64_sqdmull2_n<mode>_i
                    (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
 	       (sign_extend:<VWIDE>
                  (vec_duplicate:<VHALF>
-                   (match_operand:<VEL> 2 "register_operand" "w")))
+                   (match_operand:<VEL> 2 "register_operand" "<vwx>")))
 	       )
 	     (const_int 1)))]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 29d1378..e20d34e 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7146,7 +7146,7 @@  vld1q_dup_u64 (const uint64_t * a)
        int16x4_t result;                                                \
        __asm__ ("mla %0.4h, %2.4h, %3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7174,7 +7174,7 @@  vld1q_dup_u64 (const uint64_t * a)
        uint16x4_t result;                                               \
        __asm__ ("mla %0.4h, %2.4h, %3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7202,7 +7202,7 @@  vld1q_dup_u64 (const uint64_t * a)
        int16x4_t result;                                                \
        __asm__ ("mla %0.4h, %2.4h, %3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7230,7 +7230,7 @@  vld1q_dup_u64 (const uint64_t * a)
        uint16x4_t result;                                               \
        __asm__ ("mla %0.4h, %2.4h, %3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7267,7 +7267,7 @@  vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
   int16x4_t result;
   __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -7289,7 +7289,7 @@  vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
   uint16x4_t result;
   __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -7380,7 +7380,7 @@  vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
        int32x4_t result;                                                \
        __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]"                         \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7408,7 +7408,7 @@  vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
        uint32x4_t result;                                               \
        __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]"                         \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7436,7 +7436,7 @@  vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
        int32x4_t result;                                                \
        __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]"                         \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7464,7 +7464,7 @@  vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
        uint32x4_t result;                                               \
        __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]"                         \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7489,7 +7489,7 @@  vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
   int32x4_t result;
   __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -7511,7 +7511,7 @@  vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
   uint32x4_t result;
   __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -7602,7 +7602,7 @@  vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
        int32x4_t result;                                                \
        __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7630,7 +7630,7 @@  vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
        uint32x4_t result;                                               \
        __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7658,7 +7658,7 @@  vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
        int32x4_t result;                                                \
        __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]"                          \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7686,7 +7686,7 @@  vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
        uint32x4_t result;                                               \
        __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]"                          \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7711,7 +7711,7 @@  vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c)
   int32x4_t result;
   __asm__ ("smlal %0.4s,%2.4h,%3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -7733,7 +7733,7 @@  vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c)
   uint32x4_t result;
   __asm__ ("umlal %0.4s,%2.4h,%3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -7839,7 +7839,7 @@  vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        int16x8_t result;                                                \
        __asm__ ("mla %0.8h, %2.8h, %3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7867,7 +7867,7 @@  vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        uint16x8_t result;                                               \
        __asm__ ("mla %0.8h, %2.8h, %3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7895,7 +7895,7 @@  vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        int16x8_t result;                                                \
        __asm__ ("mla %0.8h, %2.8h, %3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7923,7 +7923,7 @@  vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        uint16x8_t result;                                               \
        __asm__ ("mla %0.8h, %2.8h, %3.h[%4]"                            \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -7972,7 +7972,7 @@  vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
   int16x8_t result;
   __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -7994,7 +7994,7 @@  vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
   uint16x8_t result;
   __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8100,7 +8100,7 @@  vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
        int16x4_t result;                                                \
        __asm__ ("mls %0.4h,%2.4h,%3.h[%4]"                              \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8128,7 +8128,7 @@  vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
        uint16x4_t result;                                               \
        __asm__ ("mls %0.4h,%2.4h,%3.h[%4]"                              \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8165,7 +8165,7 @@  vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
   int16x4_t result;
   __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8187,7 +8187,7 @@  vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
   uint16x4_t result;
   __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8278,7 +8278,7 @@  vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
        int32x4_t result;                                                \
        __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]"                         \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8306,7 +8306,7 @@  vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
        uint32x4_t result;                                               \
        __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]"                         \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8334,7 +8334,7 @@  vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
        int32x4_t result;                                                \
        __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]"                         \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8362,7 +8362,7 @@  vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
        uint32x4_t result;                                               \
        __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]"                         \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8387,7 +8387,7 @@  vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
   int32x4_t result;
   __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8409,7 +8409,7 @@  vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
   uint32x4_t result;
   __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8500,7 +8500,7 @@  vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
        int32x4_t result;                                                \
        __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]"                          \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8528,7 +8528,7 @@  vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
        uint32x4_t result;                                               \
        __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]"                          \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8556,7 +8556,7 @@  vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
        int32x4_t result;                                                \
        __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]"                          \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8584,7 +8584,7 @@  vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
        uint32x4_t result;                                               \
        __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]"                          \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8609,7 +8609,7 @@  vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c)
   int32x4_t result;
   __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8631,7 +8631,7 @@  vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c)
   uint32x4_t result;
   __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8737,7 +8737,7 @@  vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        int16x8_t result;                                                \
        __asm__ ("mls %0.8h,%2.8h,%3.h[%4]"                              \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8765,7 +8765,7 @@  vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        uint16x8_t result;                                               \
        __asm__ ("mls %0.8h,%2.8h,%3.h[%4]"                              \
                 : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
+                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -8808,7 +8808,7 @@  vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        int16x8_t __result;						\
        __asm__ ("mls %0.8h, %2.8h, %3.h[%4]"				\
                 : "=w"(__result)					\
-                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)		\
+                : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d)		\
                 : /* No clobbers */);					\
        __result;							\
      })
@@ -8836,7 +8836,7 @@  vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        uint16x8_t __result;						\
        __asm__ ("mls %0.8h, %2.8h, %3.h[%4]"				\
                 : "=w"(__result)					\
-                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)		\
+                : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d)		\
                 : /* No clobbers */);					\
        __result;							\
      })
@@ -8874,7 +8874,7 @@  vmlsq_n_f64 (float64x2_t a, float64x2_t b, float64_t c)
   float64x2_t t1;
   __asm__ ("fmul %1.2d, %3.2d, %4.d[0]; fsub %0.2d, %0.2d, %1.2d"
            : "=w"(result), "=w"(t1)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8885,7 +8885,7 @@  vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
   int16x8_t result;
   __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -8907,7 +8907,7 @@  vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
   uint16x8_t result;
   __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
            : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
+           : "0"(a), "w"(b), "x"(c)
            : /* No clobbers */);
   return result;
 }
@@ -9648,7 +9648,7 @@  vmul_n_s16 (int16x4_t a, int16_t b)
   int16x4_t result;
   __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -9670,7 +9670,7 @@  vmul_n_u16 (uint16x4_t a, uint16_t b)
   uint16x4_t result;
   __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -9707,7 +9707,7 @@  vmul_n_u32 (uint32x2_t a, uint32_t b)
        int32x4_t result;                                                \
        __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9733,7 +9733,7 @@  vmul_n_u32 (uint32x2_t a, uint32_t b)
        uint32x4_t result;                                               \
        __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9759,7 +9759,7 @@  vmul_n_u32 (uint32x2_t a, uint32_t b)
        int32x4_t result;                                                \
        __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9785,7 +9785,7 @@  vmul_n_u32 (uint32x2_t a, uint32_t b)
        uint32x4_t result;                                               \
        __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9809,7 +9809,7 @@  vmull_high_n_s16 (int16x8_t a, int16_t b)
   int32x4_t result;
   __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -9831,7 +9831,7 @@  vmull_high_n_u16 (uint16x8_t a, uint16_t b)
   uint32x4_t result;
   __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -9932,7 +9932,7 @@  vmull_high_u32 (uint32x4_t a, uint32x4_t b)
        int32x4_t result;                                                \
        __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9958,7 +9958,7 @@  vmull_high_u32 (uint32x4_t a, uint32x4_t b)
        uint32x4_t result;                                               \
        __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -9984,7 +9984,7 @@  vmull_high_u32 (uint32x4_t a, uint32x4_t b)
        int32x4_t result;                                                \
        __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -10010,7 +10010,7 @@  vmull_high_u32 (uint32x4_t a, uint32x4_t b)
        uint32x4_t result;                                               \
        __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -10034,7 +10034,7 @@  vmull_n_s16 (int16x4_t a, int16_t b)
   int32x4_t result;
   __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -10056,7 +10056,7 @@  vmull_n_u16 (uint16x4_t a, uint16_t b)
   uint32x4_t result;
   __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -10183,7 +10183,7 @@  vmull_u32 (uint32x2_t a, uint32x2_t b)
        int16x8_t result;                                                \
        __asm__ ("mul %0.8h,%1.8h,%2.h[%3]"                              \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -10209,7 +10209,7 @@  vmull_u32 (uint32x2_t a, uint32x2_t b)
        uint16x8_t result;                                               \
        __asm__ ("mul %0.8h,%1.8h,%2.h[%3]"                              \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -10261,7 +10261,7 @@  vmull_u32 (uint32x2_t a, uint32x2_t b)
        int16x8_t result;                                                \
        __asm__ ("mul %0.8h, %1.8h, %2.h[%3]"                            \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -10287,7 +10287,7 @@  vmull_u32 (uint32x2_t a, uint32x2_t b)
        uint16x8_t result;                                               \
        __asm__ ("mul %0.8h, %1.8h, %2.h[%3]"                            \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "w"(a_), "x"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
@@ -10333,7 +10333,7 @@  vmulq_n_s16 (int16x8_t a, int16_t b)
   int16x8_t result;
   __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -10355,7 +10355,7 @@  vmulq_n_u16 (uint16x8_t a, uint16_t b)
   uint16x8_t result;
   __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -11821,7 +11821,7 @@  vqrdmulh_n_s16 (int16x4_t a, int16_t b)
   int16x4_t result;
   __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }
@@ -11843,7 +11843,7 @@  vqrdmulhq_n_s16 (int16x8_t a, int16_t b)
   int16x8_t result;
   __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(a), "x"(b)
            : /* No clobbers */);
   return result;
 }