
[AArch64,NEON] Improve vmulX intrinsics

Message ID 5486F084.6010202@huawei.com
State New

Commit Message

Jiangjiji Dec. 9, 2014, 12:52 p.m. UTC
Hi,
      This patch converts more intrinsics to use builtin functions instead of the
previous inline assembly syntax.
      Passed Christophe Lyon's AdvSIMD intrinsics testsuite.

      Three test cases are added for intrinsics not covered by the testsuite:
      gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
      gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
      gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c

      Regtested with aarch64-linux-gnu on QEMU.
      No regressions on the aarch64_be-linux-gnu big-endian target either.
      OK for the trunk?
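
      To illustrate the conversion, the pattern for vmull_s16 is sketched
below.  The "before" form is the inline-assembly version this patch removes
from arm_neon.h; the "after" form is illustrative only, with the builtin name
inferred from the new "smull" entry in aarch64-simd-builtins.def and the
usual __builtin_aarch64_<name><mode> naming scheme:

  /* Before: an opaque inline-assembly block the optimizers cannot see into.  */
  __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
  vmull_s16 (int16x4_t a, int16x4_t b)
  {
    int32x4_t result;
    __asm__ ("smull %0.4s, %1.4h, %2.4h"
             : "=w"(result)
             : "w"(a), "w"(b)
             : /* No clobbers */);
    return result;
  }

  /* After: a builtin expanded through the corresponding RTL pattern
     (here aarch64_smull<mode>), which the compiler can analyse.  */
  __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
  vmull_s16 (int16x4_t __a, int16x4_t __b)
  {
    return __builtin_aarch64_smullv4hi (__a, __b);
  }
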
Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 218464)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@
+2014-12-09  Felix Yang  <felix.yang@huawei.com>
+            Jiji Jiang  <jiangjiji@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspecs and mode iterator.
+
 2014-12-07  Felix Yang  <felix.yang@huawei.com>
 	    Shanyao Chen  <chenshanyao@huawei.com>
 
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
@@ -0,0 +1,111 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad,
+                                        0xfe24, 0xfe9b, 0xff12, 0xff89 };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff9a0, 0xfffffa28,
+                                        0xfffffab0, 0xfffffb38 };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff7a2,
+                                        0xfffffffffffff83b };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae,
+                                         0xa758, 0xa802, 0xa8ac, 0xa956 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7,
+                                         0xbaf8b2, 0xbaf96d };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbfffff4d8,
+                                         0xcbfffff5a4};
+VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce,
+                                         0x6798, 0x6732, 0x66cc, 0x6666 };
+
+#ifndef INSN_NAME
+#define INSN_NAME vmull_high
+#define TEST_MSG "VMUL_HIGH"
+#endif
+
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+#define DECL_VMUL(T, W, N)                      \
+  DECL_VARIABLE(vector1, T, W, N);              \
+  DECL_VARIABLE(vector2, T, W, N);		
+
+  /* vector_res = OP(vector1, vector2), then store the result.  */
+#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1)                 \
+  VECT_VAR(vector_res, T1, W1, N1) =                                    \
+    INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),                      \
+                               VECT_VAR(vector2, T1, W, N));            \
+  vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1),                        \
+		    VECT_VAR(vector_res, T1, W1, N1))
+
+#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1)	                \
+  TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1)
+
+#define CHECK_VMULL_HIGH_RESULTS(test_name,comment)                     \
+  {                                                                     \
+    CHECK(test_name, int, 16, 8, PRIx16, expected, comment);            \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 16, 8, PRIx16,  expected, comment);          \
+    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);           \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+    CHECK(test_name, poly, 16, 8, PRIx16, expected, comment);           \
+  }  
+
+  DECL_VMUL(int, 8, 16);
+  DECL_VMUL(int, 16, 8);
+  DECL_VMUL(int, 32, 4);
+  DECL_VMUL(uint, 8, 16);
+  DECL_VMUL(uint, 16, 8);
+  DECL_VMUL(uint, 32, 4);
+  DECL_VMUL(poly, 8, 16);
+
+  DECL_VARIABLE(vector_res, int, 16, 8);
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 16, 8);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+  DECL_VARIABLE(vector_res, poly, 16, 8);
+
+  clean_results ();
+
+  /* Initialize input "vector1" from "buffer".  */
+  VLOAD(vector1, buffer, q, int, s, 8, 16);
+  VLOAD(vector1, buffer, q, int, s, 16, 8);
+  VLOAD(vector1, buffer, q, int, s, 32, 4);
+  VLOAD(vector1, buffer, q, uint, u, 8, 16);
+  VLOAD(vector1, buffer, q, uint, u, 16, 8);
+  VLOAD(vector1, buffer, q, uint, u, 32, 4);
+  VLOAD(vector1, buffer, q, poly, p, 8, 16);
+
+  /* Choose init value arbitrarily.  */
+  VDUP(vector2, q, int, s, 8, 16, 0x77);
+  VDUP(vector2, q, int, s, 16, 8, 0x88);
+  VDUP(vector2, q, int, s, 32, 4, 0x99);
+  VDUP(vector2, q, uint, u, 8, 16, 0xAA);
+  VDUP(vector2, q, uint, u, 16, 8, 0xBB);
+  VDUP(vector2, q, uint, u, 32, 4, 0xCC);
+  VDUP(vector2, q, poly, p, 8, 16, 0xAA);
+
+  /* Execute the tests.  */
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 8, 16, 16, 8);
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4);
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 8, 16, 16, 8);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2);
+  TEST_VMULL_HIGH(INSN_NAME, , poly, p, 8, 16, 16, 8);
+
+  CHECK_VMULL_HIGH_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+  FNNAME (INSN_NAME) ();
+
+  return 0;
+}

Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
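
For reference, the token pasting in TEST_VMULL_HIGH above resolves to an
ordinary intrinsic call followed by a store.  Assuming the usual VECT_VAR
name mangling from arm-neon-ref.h (VECT_VAR(v, int, 16, 8) becomes
v_int16x8), the invocation TEST_VMULL_HIGH(vmull_high, , int, s, 8, 16, 16, 8)
expands along the lines of:

  /* INSN##Q##_##T2##W pastes to vmull_high_s8, vst1q##_##T2##W1 to vst1q_s16.  */
  vector_res_int16x8 = vmull_high_s8 (vector1_int8x16, vector2_int8x16);
  vst1q_s16 (result_int16x8, vector_res_int16x8);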

Comments

Christophe Lyon Dec. 9, 2014, 1:42 p.m. UTC | #1
On 9 December 2014 at 13:52, Jiangjiji <jiangjiji@huawei.com> wrote:
> Hi,
>      This patch converts more intrinsics to use builtin functions instead
> of the previous inline assembly syntax.
>      Passed Christophe Lyon's AdvSIMD intrinsics testsuite.
>
>      Three test cases are added for intrinsics not covered by the testsuite:
>      gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
>      gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
>      gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c
>

As I said here:
https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01934.html
I am in the process of converting my existing testsuite to GCC/DejaGnu.
Please do not duplicate work.


> [snip: quoted ChangeLog entry and vmull_high.c test, identical to the patch
> above]
>
> Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
> ___________________________________________________________________
> Added: svn:executable
>    + *
>
> Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c	(revision 0)
> +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c	(revision 0)
> @@ -0,0 +1,136 @@
> +#include <arm_neon.h>
> +#include "arm-neon-ref.h"
> +#include "compute-ref-data.h"
> +
> +VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 };
> +VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000 };
> +VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 };
> +VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 };
> +
> +#define TEST_MSG "VMULL_HIGH_LANE/VMULL_HIGH_LANEQ"
> +void exec_vmull_high_lane (void)
> +{
> +  /* vector_res = vmull_high_lane(vector, vector2, lane), then store the result.  */
> +#define TEST_VMULL_HIGH_LANE(T1, T2, W, W2, N1, N2, L)                  \
> +  VECT_VAR(vector_res, T1, W2, N2) =                                    \
> +    vmull##_high_lane_##T2##W(VECT_VAR(vector, T1, W, N1 ),             \
> +                         VECT_VAR(vector2, T1, W, N2),                  \
> +                         L);                                            \
> +  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, N2))
> +
> +#define CHECK_VMULL_HIGH_LANE_RESULTS(test_name,comment)                \
> +  {                                                                     \
> +    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
> +    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
> +    CHECK(test_name, uint, 32, 4, PRIx32,  expected, comment);          \
> +    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
> +  }
> +
> +
> +  /* With ARM RVCT, we need to declare variables before any executable
> +     statement */
> +  DECL_VARIABLE(vector, int, 16, 8);
> +  DECL_VARIABLE(vector, int, 32, 4);
> +  DECL_VARIABLE(vector, uint, 16, 8);
> +  DECL_VARIABLE(vector, uint, 32, 4);
> +  DECL_VARIABLE(vector2, int, 16, 4);
> +  DECL_VARIABLE(vector2, int, 32, 2);
> +  DECL_VARIABLE(vector2, uint, 16, 4);
> +  DECL_VARIABLE(vector2, uint, 32, 2);
> +
> +  DECL_VARIABLE(vector_res, int, 32, 4);
> +  DECL_VARIABLE(vector_res, int, 64, 2);
> +  DECL_VARIABLE(vector_res, uint, 32, 4);
> +  DECL_VARIABLE(vector_res, uint, 64, 2);
> +
> +  clean_results ();
> +
> +  /* Initialize vector2 */
> +  VDUP(vector2, , int, s, 16, 4, 0x1000);
> +  VDUP(vector2, , int, s, 32, 2, 0x1000);
> +  VDUP(vector2, , uint, u, 16, 4, 0x1000);
> +  VDUP(vector2, , uint, u, 32, 2, 0x1000);
> +
> +  /* Initialize vector */
> +  VDUP(vector, q, int, s, 16, 8, 0x4);
> +  VDUP(vector, q, int, s, 32, 4, 0x2);
> +  VDUP(vector, q, uint, u, 16, 8, 0x4);
> +  VDUP(vector, q, uint, u, 32, 4, 0x2);
> +
> +  /* Choose lane arbitrarily */
> +  TEST_VMULL_HIGH_LANE(int, s, 16, 32, 8, 4, 2);
> +  TEST_VMULL_HIGH_LANE(int, s, 32, 64, 4, 2, 1);
> +  TEST_VMULL_HIGH_LANE(uint, u, 16, 32, 8, 4, 2);
> +  TEST_VMULL_HIGH_LANE(uint, u, 32, 64, 4, 2, 1);
> +
> +  CHECK_VMULL_HIGH_LANE_RESULTS (TEST_MSG, "");
> +}
> +
> +
> +void exec_vmull_high_laneq (void)
> +{
> +  /* vector_res = vmull_high_laneq(vector, vector2, lane), then store the result.  */
> +#define TEST_VMULL_HIGH_LANEQ(T1, T2, W, W2, N2, N1, L)                 \
> +  VECT_VAR(vector_res, T1, W2, N1) =                                    \
> +    vmull##_high_laneq_##T2##W(VECT_VAR(vector, T1, W, N2 ),            \
> +                         VECT_VAR(vector2, T1, W, N2),                  \
> +                         L);                                            \
> +  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N1), VECT_VAR(vector_res, T1, W2, N1))
> +
> +#define CHECK_VMULL_HIGH_LANEQ_RESULTS(test_name,comment)               \
> +  {                                                                     \
> +    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
> +    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
> +    CHECK(test_name, uint, 32, 4, PRIx32,  expected, comment);          \
> +    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
> +  }
> +
> +
> +  /* With ARM RVCT, we need to declare variables before any executable
> +     statement */
> +  DECL_VARIABLE(vector, int, 16, 8);
> +  DECL_VARIABLE(vector, int, 32, 4);
> +  DECL_VARIABLE(vector, uint, 16, 8);
> +  DECL_VARIABLE(vector, uint, 32, 4);
> +  DECL_VARIABLE(vector2, int, 16, 8);
> +  DECL_VARIABLE(vector2, int, 32, 4);
> +  DECL_VARIABLE(vector2, uint, 16, 8);
> +  DECL_VARIABLE(vector2, uint, 32, 4);
> +
> +  DECL_VARIABLE(vector_res, int, 32, 4);
> +  DECL_VARIABLE(vector_res, int, 64, 2);
> +  DECL_VARIABLE(vector_res, uint, 32, 4);
> +  DECL_VARIABLE(vector_res, uint, 64, 2);
> +
> +  clean_results ();
> +
> +  /* Initialize vector2 */
> +  VDUP(vector2, q, int, s, 16, 8, 0x1000);
> +  VDUP(vector2, q, int, s, 32, 4, 0x1000);
> +  VDUP(vector2, q, uint, u, 16, 8, 0x1000);
> +  VDUP(vector2, q, uint, u, 32, 4, 0x1000);
> +
> +  /* Initialize vector */
> +  VDUP(vector, q, int, s, 16, 8, 0x4);
> +  VDUP(vector, q, int, s, 32, 4, 0x2);
> +  VDUP(vector, q, uint, u, 16, 8, 0x4);
> +  VDUP(vector, q, uint, u, 32, 4, 0x2);
> +
> +  /* Choose lane arbitrarily */
> +  TEST_VMULL_HIGH_LANEQ(int, s, 16, 32, 8, 4, 2);
> +  TEST_VMULL_HIGH_LANEQ(int, s, 32, 64, 4, 2, 1);
> +  TEST_VMULL_HIGH_LANEQ(uint, u, 16, 32, 8, 4, 2);
> +  TEST_VMULL_HIGH_LANEQ(uint, u, 32, 64, 4, 2, 1);
> +
> +  CHECK_VMULL_HIGH_LANEQ_RESULTS (TEST_MSG, "");
> +}
> +
> +
> +
> +
> +int main (void)
> +{
> +  exec_vmull_high_lane();
> +  exec_vmull_high_laneq();
> +  return 0;
> +}
>
> Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
> ___________________________________________________________________
> Added: svn:executable
>    + *
>
> Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c	(revision 0)
> +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c	(revision 0)
> @@ -0,0 +1,81 @@
> +#include <arm_neon.h>
> +#include "arm-neon-ref.h"
> +#include "compute-ref-data.h"
> +
> +
> +/* Expected results.  */
> +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff73c, 0xfffff7f7,
> +                                        0xfffff8b2, 0xfffff96d };
> +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff4d8,
> +                                        0xfffffffffffff5a4 };
> +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xedf4d8, 0xedf5c6,
> +                                         0xedf6b4, 0xedf7a2 };
> +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfefffff20e,
> +                                         0xfefffff30d};
> +
> +#ifndef INSN_NAME
> +#define INSN_NAME vmull_high_n
> +#define TEST_MSG "VMULL_HIGH_N"
> +#endif
> +
> +#define FNNAME1(NAME) exec_ ## NAME
> +#define FNNAME(NAME) FNNAME1(NAME)
> +
> +void FNNAME (INSN_NAME) (void)
> +{
> +#define DECL_VMUL(T, W, N)                      \
> +  DECL_VARIABLE(vector1, T, W, N);
> +
> +  /* vector_res = OP(vector1, vector2), then store the result.  */
> +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C)              \
> +  VECT_VAR(vector_res, T1, W1, N1) =                                    \
> +    INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),                      \
> +                      C);                                               \
> +  vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1),                        \
> +                      VECT_VAR(vector_res, T1, W1, N1))
> +
> +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1, C)               \
> +  TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C)
> +
> +#define CHECK_VMULL_HIGH_N_RESULTS(test_name,comment)                   \
> +  {                                                                     \
> +    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
> +    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
> +    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);           \
> +    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
> +  }
> +
> +  DECL_VMUL(int, 16, 8);
> +  DECL_VMUL(int, 32, 4);
> +  DECL_VMUL(uint, 16, 8);
> +  DECL_VMUL(uint, 32, 4);
> +
> +  DECL_VARIABLE(vector_res, int, 32, 4);
> +  DECL_VARIABLE(vector_res, int, 64, 2);
> +  DECL_VARIABLE(vector_res, uint, 32, 4);
> +  DECL_VARIABLE(vector_res, uint, 64, 2);
> +
> +  clean_results ();
> +
> +  /* Initialize input "vector1" from "buffer".  */
> +  VLOAD(vector1, buffer, q, int, s, 16, 8);
> +  VLOAD(vector1, buffer, q, int, s, 32, 4);
> +  VLOAD(vector1, buffer, q, uint, u, 16, 8);
> +  VLOAD(vector1, buffer, q, uint, u, 32, 4);
> +
> +
> +  /* Execute the tests.  */
> +  TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4, 0xBB);
> +  TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2, 0xCC);
> +  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4, 0xEE);
> +  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2, 0xFF);
> +
> +  CHECK_VMULL_HIGH_N_RESULTS (TEST_MSG, "");
> +}
> +
> +int main (void)
> +{
> +  FNNAME (INSN_NAME) ();
> +
> +  return 0;
> +}
>
> Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c
> ___________________________________________________________________
> Added: svn:executable
>    + *
>
> Index: gcc/testsuite/ChangeLog
> ===================================================================
> --- gcc/testsuite/ChangeLog     (revision 218464)
> +++ gcc/testsuite/ChangeLog     (working copy)
> @@ -1,3 +1,13 @@
> +2014-12-09  Felix Yang  <felix.yang@huawei.com>
> +            Jiji Jiang  <jiangjiji@huawei.com>
> +
> +	* gcc.target/aarch64/advsimd-intrinsics/vmull_high.c: New test.
> +	* gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c: New test.
> +	* gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c: New test.
> +
>  2014-12-07  Christophe Lyon  <christophe.lyon@linaro.org>
>
>         * gcc.target/aarch64/advsimd-intrinsics/vaddhn.c: Actually execute
> Index: gcc/config/aarch64/arm_neon.h
> ===================================================================
> --- gcc/config/aarch64/arm_neon.h       (revision 218464)
> +++ gcc/config/aarch64/arm_neon.h       (working copy)
> @@ -7627,671 +7627,6 @@ vmovn_u64 (uint64x2_t a)
>    return result;
>  }
>
> -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> -vmul_n_f32 (float32x2_t a, float32_t b)
> -{
> -  float32x2_t result;
> -  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> -vmul_n_s16 (int16x4_t a, int16_t b)
> -{
> -  int16x4_t result;
> -  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
> -           : "=w"(result)
> -           : "w"(a), "x"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> -vmul_n_s32 (int32x2_t a, int32_t b)
> -{
> -  int32x2_t result;
> -  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> -vmul_n_u16 (uint16x4_t a, uint16_t b)
> -{
> -  uint16x4_t result;
> -  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
> -           : "=w"(result)
> -           : "w"(a), "x"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> -vmul_n_u32 (uint32x2_t a, uint32_t b)
> -{
> -  uint32x2_t result;
> -  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -#define vmull_high_lane_s16(a, b, c)                                    \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       int16x4_t b_ = (b);                                              \
> -       int16x8_t a_ = (a);                                              \
> -       int32x4_t result;                                                \
> -       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "x"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_high_lane_s32(a, b, c)                                    \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       int32x2_t b_ = (b);                                              \
> -       int32x4_t a_ = (a);                                              \
> -       int64x2_t result;                                                \
> -       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_high_lane_u16(a, b, c)                                    \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       uint16x4_t b_ = (b);                                             \
> -       uint16x8_t a_ = (a);                                             \
> -       uint32x4_t result;                                               \
> -       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "x"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_high_lane_u32(a, b, c)                                    \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       uint32x2_t b_ = (b);                                             \
> -       uint32x4_t a_ = (a);                                             \
> -       uint64x2_t result;                                               \
> -       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_high_laneq_s16(a, b, c)                                   \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       int16x8_t b_ = (b);                                              \
> -       int16x8_t a_ = (a);                                              \
> -       int32x4_t result;                                                \
> -       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "x"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_high_laneq_s32(a, b, c)                                   \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       int32x4_t b_ = (b);                                              \
> -       int32x4_t a_ = (a);                                              \
> -       int64x2_t result;                                                \
> -       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_high_laneq_u16(a, b, c)                                   \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       uint16x8_t b_ = (b);                                             \
> -       uint16x8_t a_ = (a);                                             \
> -       uint32x4_t result;                                               \
> -       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "x"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_high_laneq_u32(a, b, c)                                   \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       uint32x4_t b_ = (b);                                             \
> -       uint32x4_t a_ = (a);                                             \
> -       uint64x2_t result;                                               \
> -       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vmull_high_n_s16 (int16x8_t a, int16_t b)
> -{
> -  int32x4_t result;
> -  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
> -           : "=w"(result)
> -           : "w"(a), "x"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> -vmull_high_n_s32 (int32x4_t a, int32_t b)
> -{
> -  int64x2_t result;
> -  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vmull_high_n_u16 (uint16x8_t a, uint16_t b)
> -{
> -  uint32x4_t result;
> -  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
> -           : "=w"(result)
> -           : "w"(a), "x"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> -vmull_high_n_u32 (uint32x4_t a, uint32_t b)
> -{
> -  uint64x2_t result;
> -  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
> -vmull_high_p8 (poly8x16_t a, poly8x16_t b)
> -{
> -  poly16x8_t result;
> -  __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> -vmull_high_s8 (int8x16_t a, int8x16_t b)
> -{
> -  int16x8_t result;
> -  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vmull_high_s16 (int16x8_t a, int16x8_t b)
> -{
> -  int32x4_t result;
> -  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> -vmull_high_s32 (int32x4_t a, int32x4_t b)
> -{
> -  int64x2_t result;
> -  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> -vmull_high_u8 (uint8x16_t a, uint8x16_t b)
> -{
> -  uint16x8_t result;
> -  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vmull_high_u16 (uint16x8_t a, uint16x8_t b)
> -{
> -  uint32x4_t result;
> -  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> -vmull_high_u32 (uint32x4_t a, uint32x4_t b)
> -{
> -  uint64x2_t result;
> -  __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -#define vmull_lane_s16(a, b, c)                                         \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       int16x4_t b_ = (b);                                              \
> -       int16x4_t a_ = (a);                                              \
> -       int32x4_t result;                                                \
> -       __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "x"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_lane_s32(a, b, c)                                         \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       int32x2_t b_ = (b);                                              \
> -       int32x2_t a_ = (a);                                              \
> -       int64x2_t result;                                                \
> -       __asm__ ("smull %0.2d,%1.2s,%2.s[%3]"                            \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_lane_u16(a, b, c)                                         \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       uint16x4_t b_ = (b);                                             \
> -       uint16x4_t a_ = (a);                                             \
> -       uint32x4_t result;                                               \
> -       __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "x"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_lane_u32(a, b, c)                                         \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       uint32x2_t b_ = (b);                                             \
> -       uint32x2_t a_ = (a);                                             \
> -       uint64x2_t result;                                               \
> -       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_laneq_s16(a, b, c)                                        \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       int16x8_t b_ = (b);                                              \
> -       int16x4_t a_ = (a);                                              \
> -       int32x4_t result;                                                \
> -       __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "x"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_laneq_s32(a, b, c)                                        \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       int32x4_t b_ = (b);                                              \
> -       int32x2_t a_ = (a);                                              \
> -       int64x2_t result;                                                \
> -       __asm__ ("smull %0.2d, %1.2s, %2.s[%3]"                          \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_laneq_u16(a, b, c)                                        \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       uint16x8_t b_ = (b);                                             \
> -       uint16x4_t a_ = (a);                                             \
> -       uint32x4_t result;                                               \
> -       __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "x"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmull_laneq_u32(a, b, c)                                        \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       uint32x4_t b_ = (b);                                             \
> -       uint32x2_t a_ = (a);                                             \
> -       uint64x2_t result;                                               \
> -       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vmull_n_s16 (int16x4_t a, int16_t b)
> -{
> -  int32x4_t result;
> -  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
> -           : "=w"(result)
> -           : "w"(a), "x"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> -vmull_n_s32 (int32x2_t a, int32_t b)
> -{
> -  int64x2_t result;
> -  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vmull_n_u16 (uint16x4_t a, uint16_t b)
> -{
> -  uint32x4_t result;
> -  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
> -           : "=w"(result)
> -           : "w"(a), "x"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> -vmull_n_u32 (uint32x2_t a, uint32_t b)
> -{
> -  uint64x2_t result;
> -  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
> -vmull_p8 (poly8x8_t a, poly8x8_t b)
> -{
> -  poly16x8_t result;
> -  __asm__ ("pmull %0.8h, %1.8b, %2.8b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> -vmull_s8 (int8x8_t a, int8x8_t b)
> -{
> -  int16x8_t result;
> -  __asm__ ("smull %0.8h, %1.8b, %2.8b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vmull_s16 (int16x4_t a, int16x4_t b)
> -{
> -  int32x4_t result;
> -  __asm__ ("smull %0.4s, %1.4h, %2.4h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> -vmull_s32 (int32x2_t a, int32x2_t b)
> -{
> -  int64x2_t result;
> -  __asm__ ("smull %0.2d, %1.2s, %2.2s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> -vmull_u8 (uint8x8_t a, uint8x8_t b)
> -{
> -  uint16x8_t result;
> -  __asm__ ("umull %0.8h, %1.8b, %2.8b"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vmull_u16 (uint16x4_t a, uint16x4_t b)
> -{
> -  uint32x4_t result;
> -  __asm__ ("umull %0.4s, %1.4h, %2.4h"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> -vmull_u32 (uint32x2_t a, uint32x2_t b)
> -{
> -  uint64x2_t result;
> -  __asm__ ("umull %0.2d, %1.2s, %2.2s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> -vmulq_n_f32 (float32x4_t a, float32_t b)
> -{
> -  float32x4_t result;
> -  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
> -vmulq_n_f64 (float64x2_t a, float64_t b)
> -{
> -  float64x2_t result;
> -  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> -vmulq_n_s16 (int16x8_t a, int16_t b)
> -{
> -  int16x8_t result;
> -  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
> -           : "=w"(result)
> -           : "w"(a), "x"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> -vmulq_n_s32 (int32x4_t a, int32_t b)
> -{
> -  int32x4_t result;
> -  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> -vmulq_n_u16 (uint16x8_t a, uint16_t b)
> -{
> -  uint16x8_t result;
> -  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
> -           : "=w"(result)
> -           : "w"(a), "x"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> -vmulq_n_u32 (uint32x4_t a, uint32_t b)
> -{
> -  uint32x4_t result;
> -  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> -vmulx_f32 (float32x2_t a, float32x2_t b)
> -{
> -  float32x2_t result;
> -  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -#define vmulx_lane_f32(a, b, c)                                         \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       float32x4_t b_ = (b);                                            \
> -       float32x2_t a_ = (a);                                            \
> -       float32x2_t result;                                              \
> -       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -__extension__ static __inline float64_t __attribute__ ((__always_inline__))
> -vmulxd_f64 (float64_t a, float64_t b)
> -{
> -  float64_t result;
> -  __asm__ ("fmulx %d0, %d1, %d2"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> -vmulxq_f32 (float32x4_t a, float32x4_t b)
> -{
> -  float32x4_t result;
> -  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
> -vmulxq_f64 (float64x2_t a, float64x2_t b)
> -{
> -  float64x2_t result;
> -  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
> -#define vmulxq_lane_f32(a, b, c)                                        \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       float32x4_t b_ = (b);                                            \
> -       float32x4_t a_ = (a);                                            \
> -       float32x4_t result;                                              \
> -       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -#define vmulxq_lane_f64(a, b, c)                                        \
> -  __extension__                                                         \
> -    ({                                                                  \
> -       float64x2_t b_ = (b);                                            \
> -       float64x2_t a_ = (a);                                            \
> -       float64x2_t result;                                              \
> -       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
> -                : "=w"(result)                                          \
> -                : "w"(a_), "w"(b_), "i"(c)                              \
> -                : /* No clobbers */);                                   \
> -       result;                                                          \
> -     })
> -
> -__extension__ static __inline float32_t __attribute__ ((__always_inline__))
> -vmulxs_f32 (float32_t a, float32_t b)
> -{
> -  float32_t result;
> -  __asm__ ("fmulx %s0, %s1, %s2"
> -           : "=w"(result)
> -           : "w"(a), "w"(b)
> -           : /* No clobbers */);
> -  return result;
> -}
> -
>  __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
>  vmvn_p8 (poly8x8_t a)
>  {
> @@ -19172,6 +18507,78 @@ vmul_n_f64  (float64x1_t __a, float64_t __b)
>    return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
>  }
>
> +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> +vmul_n_f32 (float32x2_t __a, float32_t __b)
> +{
> +  return __builtin_aarch64_mul_nv2sf (__a, __b);
> +}
> +
> +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> +vmul_n_s16 (int16x4_t __a, int16_t __b)
> +{
> +  return __builtin_aarch64_mul_nv4hi (__a, __b);
> +}
> +
> +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
> +vmul_n_s32 (int32x2_t __a, int32_t __b)
> +{
> +  return __builtin_aarch64_mul_nv2si (__a, __b);
> +}
> +
> +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
> +vmul_n_u16 (uint16x4_t __a, uint16_t __b)
> +{
> +  return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a,
> +                                                   (int16_t)__b);
> +}
> +
> +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
> +vmul_n_u32 (uint32x2_t __a, uint32_t __b)
> +{
> +  return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a,
> +                                                   (int32_t)__b);
> +}
> +
> +/* vmulq_n  */
> +
> +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> +vmulq_n_f32 (float32x4_t __a, float32_t __b)
> +{
> +  return __builtin_aarch64_mul_nv4sf (__a, __b);
> +}
> +
> +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
> +vmulq_n_f64 (float64x2_t __a, float64_t __b)
> +{
> +  return __builtin_aarch64_mul_nv2df (__a, __b);
> +}
> +
> +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> +vmulq_n_s16 (int16x8_t __a, int16_t __b)
> +{
> +  return __builtin_aarch64_mul_nv8hi (__a, __b);
> +}
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vmulq_n_s32 (int32x4_t __a, int32_t __b)
> +{
> +  return __builtin_aarch64_mul_nv4si (__a, __b);
> +}
> +
> +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
> +vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
> +{
> +  return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a,
> +                                                   (int16_t)__b);
> +}
> +
> +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> +vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
> +{
> +  return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a,
> +                                                   (int32_t)__b);
> +}
> +
>  /* vmulq_lane  */
>   __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
> @@ -19249,6 +18656,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c
>    return __a * __aarch64_vgetq_lane_u32 (__b, __lane);
>  }
>  +/* vmull_high_lane  */
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
> +{
> +  return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c);
> +}
> +
> +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
> +{
> +  return __builtin_aarch64_smull2_lanev4si (__a, __b, __c);
> +}
> +
> +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
> +{
> +  return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a,
> +                                                         (int16x4_t) __b,
> +                                                         __c);
> +}
> +
> +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
> +{
> +  return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a,
> +                                                         (int32x2_t) __b,
> +                                                          __c);
> +}
> +
> +/* vmull_high_laneq  */
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
> +{
> +  return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c);
> +}
> +
> +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
> +{
> +  return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c);
> +}
> +
> +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> +vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
> +{
> +  return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a,
> +                                                          (int16x8_t)__b,
> +                                                          __c);
> +}
> +
> +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
> +{
> +  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,
> +                                                          (int32x4_t) __b,
> +                                                           __c);
> +}
> +
> +/* vmull_high_n  */
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vmull_high_n_s16 (int16x8_t __a, int16_t __b)
> +{
> +  return __builtin_aarch64_smull2_nv8hi (__a, __b);
> +}
> +
> +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
> +vmull_high_n_s32 (int32x4_t __a, int32_t __b)
> +{
> +  return __builtin_aarch64_smull2_nv4si (__a, __b);
> +}
> +
> +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
> +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
> +{
> +  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);
> +}
> +
> +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
> +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
> +{
> +  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);
> +}
> +
> +/* vmull_high  */
> +
> +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
> +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)
> +{
> +  return __builtin_aarch64_pmull2v16qi_ppp (__a, __b);
> +}
> +
> +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
> +vmull_high_s8 (int8x16_t __a, int8x16_t __b)
> +{
> +  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
> +}
> +
> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
> +vmull_high_s16 (int16x8_t __a, int16x8_t __b)
> +{
> +  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
Jiangjiji Dec. 11, 2014, 2:02 a.m. UTC | #2
Hi Christophe,
These testcases cover intrinsics which are not in the glorious testsuite.
If these cases are already on your todo list, I will exclude them.

Thanks.
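
For readers skimming the thread, here is a minimal standalone sketch of the
semantics the new vmull_high tests exercise: vmull_high_s16 widens the upper
four lanes of each int16x8_t operand and multiplies them, so lane i of the
result is (int32_t) a[i + 4] * b[i + 4].  The input values below are
arbitrary illustrations, not taken from the patch's testsuite.

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  int16_t ina[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int16_t inb[8] = { 9, 9, 9, 9, -1000, -1000, -1000, -1000 };
  int32_t out[4];

  int16x8_t a = vld1q_s16 (ina);
  int16x8_t b = vld1q_s16 (inb);
  /* Multiplies lanes 4..7 of a and b, widening to 32 bits.  */
  int32x4_t r = vmull_high_s16 (a, b);  /* { -5000, -6000, -7000, -8000 }  */

  vst1q_s32 (out, r);
  for (int i = 0; i < 4; i++)
    printf ("%d\n", out[i]);
  return 0;
}

With the patched arm_neon.h, this call expands to
__builtin_aarch64_vec_widen_smult_hi_v8hi rather than inline assembly, which
should let the compiler schedule and optimize around the smull2 instruction.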

-----Original Message-----
From: Christophe Lyon [mailto:christophe.lyon@linaro.org]
Sent: December 9, 2014 21:43
To: Jiangjiji
Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw; Yangfei (Felix); Marcus Shawcroft
Subject: Re: [AArch64, NEON] Improve vmulX intrinsics

On 9 December 2014 at 13:52, Jiangjiji <jiangjiji@huawei.com> wrote:
> Hi,
>      This patch converts more intrinsics to use builtin functions instead of
> the previous inline assembly syntax.
>      Passed the glorious testsuite of Christophe Lyon.
>
>      Three testcases are added for the testing of intrinsics which are not
> covered by the testsuite:
>      gcc.target/aarch64/vmull_high.c
>      gcc.target/aarch64/vmull_high_lane.c
>      gcc.target/aarch64/vmull_high_n.c
>

As I said here:
https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01934.html
I am in the process of converting my existing testsuite to GCC/Dejagnu.
Please do not duplicate work.

>      Regtested with aarch64-linux-gnu on QEMU.
>      This patch has no regressions for aarch64_be-linux-gnu big-endian
> target too.
>      OK for the trunk?


> ((__always_inline__))

> +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)

> +{

> +  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,

> +                                                          (int32x4_t) __b,

> +                                                           __c);

> +}

> +

> +/* vmull_high_n  */

> +

> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))

> +vmull_high_n_s16 (int16x8_t __a, int16_t __b)

> +{

> +  return __builtin_aarch64_smull2_nv8hi (__a, __b);

> +}

> +

> +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))

> +vmull_high_n_s32 (int32x4_t __a, int32_t __b)

> +{

> +  return __builtin_aarch64_smull2_nv4si (__a, __b);

> +}

> +

> +__extension__ static __inline uint32x4_t __attribute__

> ((__always_inline__))

> +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)

> +{

> +  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);

> +}

> +

> +__extension__ static __inline uint64x2_t __attribute__

> ((__always_inline__))

> +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)

> +{

> +  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);

> +}

> +

> +/* vmull_high  */

> +

> +__extension__ static __inline poly16x8_t __attribute__

> ((__always_inline__))

> +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)

> +{

> +    return  __builtin_aarch64_pmull2v16qi_ppp (__a, __b);

> +}

> +

> +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))

> +vmull_high_s8 (int8x16_t __a, int8x16_t __b)

> +{

> +  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);

> +}

> +

> +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))

> +vmull_high_s16 (int16x8_t __a, int16x8_t __b)

> +{

> +  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
diff mbox

Patch

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 218464)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@ 
+2014-12-09  Felix Yang  <felix.yang@huawei.com>
+            Jiji Jiang  <jiangjiji@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspecs and mode iterator.
+
  2014-12-07  Felix Yang  <felix.yang@huawei.com>
  	    Shanyao Chen  <chenshanyao@huawei.com>
  
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
@@ -0,0 +1,111 @@ 
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad,
+                                        0xfe24, 0xfe9b, 0xff12, 0xff89 };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff9a0, 0xfffffa28,
+                                        0xfffffab0, 0xfffffb38 };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff7a2,
+                                        0xfffffffffffff83b };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae,
+                                         0xa758, 0xa802, 0xa8ac, 0xa956 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7,
+                                         0xbaf8b2, 0xbaf96d };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbfffff4d8,
+                                         0xcbfffff5a4 };
+VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce,
+                                         0x6798, 0x6732, 0x66cc, 0x6666 };
+
+#ifndef INSN_NAME
+#define INSN_NAME vmull_high
+#define TEST_MSG "VMUL_HIGH"
+#endif
+
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+#define DECL_VMUL(T, W, N)                      \
+  DECL_VARIABLE(vector1, T, W, N);              \
+  DECL_VARIABLE(vector2, T, W, N);
+
+  /* vector_res = OP(vector1, vector2), then store the result.  */
+#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1)                 \
+  VECT_VAR(vector_res, T1, W1, N1) =                                    \
+    INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),                      \
+                      VECT_VAR(vector2, T1, W, N));                     \
+  vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1),                        \
+                   VECT_VAR(vector_res, T1, W1, N1))
+
+#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1)	                \
+  TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1)
+
+#define CHECK_VMULL_HIGH_RESULTS(test_name,comment)                     \
+  {                                                                     \
+    CHECK(test_name, int, 16, 8, PRIx16, expected, comment);            \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 16, 8, PRIx16, expected, comment);           \
+    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);           \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+    CHECK(test_name, poly, 16, 8, PRIx16, expected, comment);           \
+  }
+
+  DECL_VMUL(int, 8, 16);
+  DECL_VMUL(int, 16, 8);
+  DECL_VMUL(int, 32, 4);
+  DECL_VMUL(uint, 8, 16);
+  DECL_VMUL(uint, 16, 8);
+  DECL_VMUL(uint, 32, 4);
+  DECL_VMUL(poly, 8, 16);
+
+  DECL_VARIABLE(vector_res, int, 16, 8);
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 16, 8);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+  DECL_VARIABLE(vector_res, poly, 16, 8);
+
+  clean_results ();
+
+  /* Initialize input "vector1" from "buffer".  */
+  VLOAD(vector1, buffer, q, int, s, 8, 16);
+  VLOAD(vector1, buffer, q, int, s, 16, 8);
+  VLOAD(vector1, buffer, q, int, s, 32, 4);
+  VLOAD(vector1, buffer, q, uint, u, 8, 16);
+  VLOAD(vector1, buffer, q, uint, u, 16, 8);
+  VLOAD(vector1, buffer, q, uint, u, 32, 4);
+  VLOAD(vector1, buffer, q, poly, p, 8, 16);
+
+  /* Choose init value arbitrarily.  */
+  VDUP(vector2, q, int, s, 8, 16, 0x77);
+  VDUP(vector2, q, int, s, 16, 8, 0x88);
+  VDUP(vector2, q, int, s, 32, 4, 0x99);
+  VDUP(vector2, q, uint, u, 8, 16, 0xAA);
+  VDUP(vector2, q, uint, u, 16, 8, 0xBB);
+  VDUP(vector2, q, uint, u, 32, 4, 0xCC);
+  VDUP(vector2, q, poly, p, 8, 16, 0xAA);
+
+  /* Execute the tests.  */
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 8, 16, 16, 8);
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4);
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 8, 16, 16, 8);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2);
+  TEST_VMULL_HIGH(INSN_NAME, , poly, p, 8, 16, 16, 8);
+
+  CHECK_VMULL_HIGH_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+  FNNAME (INSN_NAME) ();
+
+  return 0;
+}

Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
___________________________________________________________________
Added: svn:executable
    + *

Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c	(revision 0)
@@ -0,0 +1,134 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 };
+VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000 };
+VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 };
+VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 };
+
+#define TEST_MSG "VMULL_HIGH_LANE/VMULL_HIGH_LANEQ"
+void exec_vmull_high_lane (void)
+{
+  /* vector_res = vmull_high_lane(vector, vector2, lane), then store it.  */
+#define TEST_VMULL_HIGH_LANE(T1, T2, W, W2, N1, N2, L)                  \
+  VECT_VAR(vector_res, T1, W2, N2) =                                    \
+    vmull##_high_lane_##T2##W(VECT_VAR(vector, T1, W, N1),              \
+                              VECT_VAR(vector2, T1, W, N2),             \
+                              L);                                       \
+  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, N2))
+
+#define CHECK_VMULL_HIGH_LANE_RESULTS(test_name,comment)                \
+  {                                                                     \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);           \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+  }
+
+
+  /* With ARM RVCT, we need to declare variables before any executable
+     statement */
+  DECL_VARIABLE(vector, int, 16, 8);
+  DECL_VARIABLE(vector, int, 32, 4);
+  DECL_VARIABLE(vector, uint, 16, 8);
+  DECL_VARIABLE(vector, uint, 32, 4);
+  DECL_VARIABLE(vector2, int, 16, 4);
+  DECL_VARIABLE(vector2, int, 32, 2);
+  DECL_VARIABLE(vector2, uint, 16, 4);
+  DECL_VARIABLE(vector2, uint, 32, 2);
+
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+
+  clean_results ();
+
+  /* Initialize vector2.  */
+  VDUP(vector2, , int, s, 16, 4, 0x1000);
+  VDUP(vector2, , int, s, 32, 2, 0x1000);
+  VDUP(vector2, , uint, u, 16, 4, 0x1000);
+  VDUP(vector2, , uint, u, 32, 2, 0x1000);
+
+  /* Initialize vector.  */
+  VDUP(vector, q, int, s, 16, 8, 0x4);
+  VDUP(vector, q, int, s, 32, 4, 0x2);
+  VDUP(vector, q, uint, u, 16, 8, 0x4);
+  VDUP(vector, q, uint, u, 32, 4, 0x2);
+
+  /* Choose lane arbitrarily */
+  TEST_VMULL_HIGH_LANE(int, s, 16, 32, 8, 4, 2);
+  TEST_VMULL_HIGH_LANE(int, s, 32, 64, 4, 2, 1);
+  TEST_VMULL_HIGH_LANE(uint, u, 16, 32, 8, 4, 2);
+  TEST_VMULL_HIGH_LANE(uint, u, 32, 64, 4, 2, 1);
+
+  CHECK_VMULL_HIGH_LANE_RESULTS (TEST_MSG, "");
+}
+
+
+void exec_vmull_high_laneq (void)
+{
+  /* vector_res = vmull_high_laneq(vector, vector2, lane), then store it.  */
+#define TEST_VMULL_HIGH_LANEQ(T1, T2, W, W2, N1, N2, L)                 \
+  VECT_VAR(vector_res, T1, W2, N2) =                                    \
+    vmull##_high_laneq_##T2##W(VECT_VAR(vector, T1, W, N1),             \
+                               VECT_VAR(vector2, T1, W, N1),            \
+                               L);                                      \
+  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, N2))
+
+#define CHECK_VMULL_HIGH_LANEQ_RESULTS(test_name,comment)               \
+  {                                                                     \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);           \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+  }
+
+
+  /* With ARM RVCT, we need to declare variables before any executable
+     statement */
+  DECL_VARIABLE(vector, int, 16, 8);
+  DECL_VARIABLE(vector, int, 32, 4);
+  DECL_VARIABLE(vector, uint, 16, 8);
+  DECL_VARIABLE(vector, uint, 32, 4);
+  DECL_VARIABLE(vector2, int, 16, 8);
+  DECL_VARIABLE(vector2, int, 32, 4);
+  DECL_VARIABLE(vector2, uint, 16, 8);
+  DECL_VARIABLE(vector2, uint, 32, 4);
+
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+
+  clean_results ();
+
+  /* Initialize vector2.  */
+  VDUP(vector2, q, int, s, 16, 8, 0x1000);
+  VDUP(vector2, q, int, s, 32, 4, 0x1000);
+  VDUP(vector2, q, uint, u, 16, 8, 0x1000);
+  VDUP(vector2, q, uint, u, 32, 4, 0x1000);
+
+  /* Initialize vector.  */
+  VDUP(vector, q, int, s, 16, 8, 0x4);
+  VDUP(vector, q, int, s, 32, 4, 0x2);
+  VDUP(vector, q, uint, u, 16, 8, 0x4);
+  VDUP(vector, q, uint, u, 32, 4, 0x2);
+
+  /* Choose lane arbitrarily */
+  TEST_VMULL_HIGH_LANEQ(int, s, 16, 32, 8, 4, 2);
+  TEST_VMULL_HIGH_LANEQ(int, s, 32, 64, 4, 2, 1);
+  TEST_VMULL_HIGH_LANEQ(uint, u, 16, 32, 8, 4, 2);
+  TEST_VMULL_HIGH_LANEQ(uint, u, 32, 64, 4, 2, 1);
+
+  CHECK_VMULL_HIGH_LANEQ_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+  exec_vmull_high_lane ();
+  exec_vmull_high_laneq ();
+
+  return 0;
+}

Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
___________________________________________________________________
Added: svn:executable
    + *

Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c	(revision 0)
@@ -0,0 +1,81 @@ 
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff73c, 0xfffff7f7,
+                                        0xfffff8b2, 0xfffff96d };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff4d8,
+                                        0xfffffffffffff5a4 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xedf4d8, 0xedf5c6,
+                                         0xedf6b4, 0xedf7a2 };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfefffff20e,
+                                         0xfefffff30d };
+
+#ifndef INSN_NAME
+#define INSN_NAME vmull_high_n
+#define TEST_MSG "VMULL_HIGH_N"
+#endif
+
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+#define DECL_VMUL(T, W, N)                      \
+  DECL_VARIABLE(vector1, T, W, N);
+
+  /* vector_res = OP(vector1, vector2), then store the result.  */
+#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C)              \
+  VECT_VAR(vector_res, T1, W1, N1) =                                    \
+    INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),                      \
+                      C);                                               \
+  vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1),                        \
+                   VECT_VAR(vector_res, T1, W1, N1))
+
+#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1, C)               \
+  TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C)
+
+#define CHECK_VMULL_HIGH_N_RESULTS(test_name,comment)                   \
+  {                                                                     \
+    CHECK(test_name, int, 32, 4, PRIx32, expected, comment);            \
+    CHECK(test_name, int, 64, 2, PRIx64, expected, comment);            \
+    CHECK(test_name, uint, 32, 4, PRIx32, expected, comment);           \
+    CHECK(test_name, uint, 64, 2, PRIx64, expected, comment);           \
+  }
+
+  DECL_VMUL(int, 16, 8);
+  DECL_VMUL(int, 32, 4);
+  DECL_VMUL(uint, 16, 8);
+  DECL_VMUL(uint, 32, 4);
+
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+
+  clean_results ();
+
+  /* Initialize input "vector1" from "buffer".  */
+  VLOAD(vector1, buffer, q, int, s, 16, 8);
+  VLOAD(vector1, buffer, q, int, s, 32, 4);
+  VLOAD(vector1, buffer, q, uint, u, 16, 8);
+  VLOAD(vector1, buffer, q, uint, u, 32, 4);
+
+
+  /* Execute the tests.  */
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4, 0xBB);
+  TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2, 0xCC);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4, 0xEE);
+  TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2, 0xFF);
+
+  CHECK_VMULL_HIGH_N_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+  FNNAME (INSN_NAME) ();
+
+  return 0;
+}

Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c
___________________________________________________________________
Added: svn:executable
    + *

Index: gcc/testsuite/ChangeLog
===================================================================
--- gcc/testsuite/ChangeLog	(revision 218464)
+++ gcc/testsuite/ChangeLog	(working copy)
@@ -1,3 +1,10 @@
+2014-12-09  Felix Yang  <felix.yang@huawei.com>
+            Jiji Jiang  <jiangjiji@huawei.com>
+
+	* gcc.target/aarch64/advsimd-intrinsics/vmull_high.c: New test.
+	* gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c: New test.
+	* gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c: New test.
+
  2014-12-07  Christophe Lyon  <christophe.lyon@linaro.org>
  
  	* gcc.target/aarch64/advsimd-intrinsics/vaddhn.c: Actually execute
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h	(revision 218464)
+++ gcc/config/aarch64/arm_neon.h	(working copy)
@@ -7627,671 +7627,6 @@  vmovn_u64 (uint64x2_t a)
    return result;
  }
  
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_s32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_lane_u32(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_s32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u16(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_high_laneq_u32(a, b, c)                                   \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_s16 (int16x8_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_s32 (int32x4_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_high_s8 (int8x16_t a, int8x16_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_high_s16 (int16x8_t a, int16x8_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_high_s32 (int32x4_t a, int32x4_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_high_u8 (uint8x16_t a, uint8x16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_high_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_high_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmull_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s,%1.4h,%2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_s32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("smull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("umull %0.4s, %1.4h, %2.h[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmull_laneq_u32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("umull %0.2d, %1.2s, %2.s[%3]"                          \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_n_s16 (int16x4_t a, int16_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_n_s32 (int32x2_t a, int32_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_p8 (poly8x8_t a, poly8x8_t b)
-{
-  poly16x8_t result;
-  __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_s8 (int8x8_t a, int8x8_t b)
-{
-  int16x8_t result;
-  __asm__ ("smull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_s16 (int16x4_t a, int16x4_t b)
-{
-  int32x4_t result;
-  __asm__ ("smull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_s32 (int32x2_t a, int32x2_t b)
-{
-  int64x2_t result;
-  __asm__ ("smull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_u8 (uint8x8_t a, uint8x8_t b)
-{
-  uint16x8_t result;
-  __asm__ ("umull %0.8h, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_u16 (uint16x4_t a, uint16x4_t b)
-{
-  uint32x4_t result;
-  __asm__ ("umull %0.4s, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_u32 (uint32x2_t a, uint32x2_t b)
-{
-  uint64x2_t result;
-  __asm__ ("umull %0.2d, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulx_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fmulx %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulxq_lane_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulxq_lane_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fmulx %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
  __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
  vmvn_p8 (poly8x8_t a)
  {
@@ -19172,6 +18507,78 @@  vmul_n_f64  (float64x1_t __a, float64_t __b)
    return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
  }
  
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv2sf (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t) __a,
+                                                   (int16_t) __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t) __a,
+                                                   (int32_t) __b);
+}
+
+/* vmulq_n  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __builtin_aarch64_mul_nv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __builtin_aarch64_mul_nv2df (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_mul_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_mul_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t) __a,
+                                                   (int16_t) __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t) __a,
+                                                   (int32_t) __b);
+}
+
  /* vmulq_lane  */
  
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
@@ -19249,6 +18656,309 @@  vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c
    return __a * __aarch64_vgetq_lane_u32 (__b, __lane);
  }
  
+/* vmull_high_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_lanev4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a,
+                                                         (int16x4_t) __b,
+                                                         __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a,
+                                                         (int32x2_t) __b,
+                                                         __c);
+}
+
+/* vmull_high_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+  return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t) __a,
+                                                          (int16x8_t) __b,
+                                                          __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+  return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a,
+                                                          (int32x4_t) __b,
+                                                          __c);
+}
+
+/* vmull_high_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull2_nv8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull2_nv4si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull2_nv4si_uuu (__a, __b);
+}
+
+/* vmull_high  */
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_high_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return __builtin_aarch64_pmull2v16qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_high_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_high_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_high_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_high_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b);
+}
+
+/* vmull_lane  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_lanev2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+  return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_laneq  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_smull_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c);
+}
+
+/* vmull_n  */
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __builtin_aarch64_smull_nv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __builtin_aarch64_smull_nv2si (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_umull_nv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_umull_nv2si_uuu (__a, __b);
+}
+
+/* vmull  */
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return __builtin_aarch64_pmullv8qi_ppp (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return __builtin_aarch64_smullv8qi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return __builtin_aarch64_smullv4hi (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return __builtin_aarch64_smullv2si (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return __builtin_aarch64_umullv8qi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return __builtin_aarch64_umullv4hi_uuu (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return __builtin_aarch64_umullv2si_uuu (__a, __b);
+}
+
+/* vmulx  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fmulxdf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fmulxv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2df (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c)
+{
+  return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fmulxsf (__a, __b);
+}
+
  /* vneg  */
  
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
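
[Illustrative commentary, not part of the patch.]  A minimal
self-contained check of the semantics the rewritten intrinsics are
expected to keep, assuming an aarch64 target where <arm_neon.h> is
available: vmull_high_* widens and multiplies the upper halves of its
128-bit sources, and vmull_n_* multiplies by a duplicated scalar.

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  int16_t a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int16_t b[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };
  int32_t out[4];

  /* vmull_high_s16 reads elements 4..7 of each 128-bit source.  */
  vst1q_s32 (out, vmull_high_s16 (vld1q_s16 (a), vld1q_s16 (b)));
  for (int i = 0; i < 4; i++)
    assert (out[i] == (int32_t) a[i + 4] * b[i + 4]);

  /* vmull_n_s16 widens a 64-bit vector and multiplies each element
     by the scalar, matching SMULL Vd.4S, Vn.4H, Vm.H[0].  */
  vst1q_s32 (out, vmull_n_s16 (vld1_s16 (a), 3));
  for (int i = 0; i < 4; i++)
    assert (out[i] == a[i] * 3);

  return 0;
}
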
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	(revision 218464)
+++ gcc/config/aarch64/iterators.md	(working copy)
@@ -273,6 +273,8 @@ 
      UNSPEC_SHA256SU1    ; Used in aarch64-simd.md.
      UNSPEC_PMULL        ; Used in aarch64-simd.md.
      UNSPEC_PMULL2       ; Used in aarch64-simd.md.
+    UNSPEC_FMULX        ; Used in aarch64-simd.md.
+    UNSPEC_FMULX_LANE   ; Used in aarch64-simd.md.
  ])
  
  ;; -------------------------------------------------------------------
@@ -462,6 +464,10 @@ 
  
  )
  
+;; 128-bit modes from which the "lane" forms of fmulx select one element.
+(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF")
+                          (V2DF "V2DF")])
+
  ;; Widened mode register suffixes for VD_BHSI/VQW.
  (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
  			  (V2SI "2d") (V16QI "8h")
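
[Illustrative commentary, not part of the patch.]  VDQF_Q maps each
float vector mode to the 128-bit mode used for the lane operand of
aarch64_fmulx_lane<mode>, so the _lane intrinsics in this patch take
their lane from a full quad register.  A hypothetical caller (the
function name is made up), assuming <arm_neon.h> on aarch64:

#include <arm_neon.h>

float32x2_t
mulx_by_lane3 (float32x2_t a, float32x4_t v)
{
  /* Multiply each element of A by lane 3 of V using FMULX, which
     returns +/-2.0 for the 0 * infinity cases instead of NaN.  */
  return vmulx_lane_f32 (a, v, 3);
}
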
Index: gcc/config/aarch64/aarch64-simd.md
===================================================================
--- gcc/config/aarch64/aarch64-simd.md	(revision 218464)
+++ gcc/config/aarch64/aarch64-simd.md	(working copy)
@@ -1394,6 +1394,254 @@ 
   }
  )
  
+(define_insn "aarch64_mul_n<mode>"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+        (mult:VMUL
+          (match_operand:VMUL 1 "register_operand" "w")
+          (vec_duplicate:VMUL
+            (match_operand:<VEL> 2 "register_operand" "<h_con>"))))]
+  "TARGET_SIMD"
+  "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_<su>mull_n<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (match_operand:<VEL> 2 "register_operand" "<vwx>")))))]
+  "TARGET_SIMD"
+  "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_mul_<Vetype>_long")]
+)
+
+(define_insn "aarch64_simd_<su>mull2_n<mode>"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+       (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+                            (match_operand:VQ_HSI 1 "register_operand" "w")
+                            (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+                     (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+                            (match_operand:<VEL> 2 "register_operand" "<vwx>")))))]
+  "TARGET_SIMD"
+  "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull2_n<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "")
+   (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" ""))
+   (match_operand:<VEL> 2 "register_operand" "")]
+ "TARGET_SIMD"
+ {
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0],
+                                                  operands[1],
+                                                  operands[2], p));
+   DONE;
+ }
+)
+
+(define_insn "aarch64_<su>mull_lane<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull_laneq<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (match_operand:VD_HSI 1 "register_operand" "w"))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:VD_HSI
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_lane<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_insn "aarch64_<su>mull2_laneq<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (mult:<VWIDE>
+          (ANY_EXTEND:<VWIDE>
+            (vec_select:<VHALF>
+              (match_operand:VQ_HSI 1 "register_operand" "w")
+              (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" "")))
+          (ANY_EXTEND:<VWIDE>
+            (vec_duplicate:<VHALF>
+              (vec_select:<VEL>
+                (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+                  (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]";
+  }
+  [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_smull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_lane<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCOND> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1],
+                                                     operands[2], operands[3],
+                                                     p));
+  DONE;
+})
+
+(define_expand "aarch64_smull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_expand "aarch64_umull2_laneq<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand" "=w")
+   (match_operand:VQ_HSI 1 "register_operand" "w")
+   (match_operand:<VCONQ> 2 "register_operand" "<vwx>")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1],
+                                                      operands[2], operands[3],
+                                                      p));
+  DONE;
+})
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:VDQF 2 "register_operand" "w")]
+                      UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+        (unspec:GPF  [(match_operand:GPF 1 "register_operand" "w")
+                      (match_operand:GPF 2 "register_operand" "w")]
+                     UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\\t%<s>0, %<s>1, %<s>2"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_fmulx_lane<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+        (unspec:VDQF  [(match_operand:VDQF 1 "register_operand" "w")
+                       (match_operand:<VDQF_Q> 2 "register_operand" "w")
+                       (match_operand:SI 3 "immediate_operand" "i")]
+                      UNSPEC_FMULX_LANE))]
+ "TARGET_SIMD"
+ "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>"
+  [(set_attr "type" "neon_mul_s")]
+)
+
+(define_insn "aarch64_pmull2v16qi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+       (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w")
+                     (match_operand:V16QI 2 "register_operand" "w")]
+                    UNSPEC_PMULL2))]
+  "TARGET_SIMD"
+  "pmull2\\t%0.8h, %1.16b, %2.16b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
+(define_insn "aarch64_pmullv8qi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+        (unspec:V8HI  [(match_operand:V8QI 1 "register_operand" "w")
+                       (match_operand:V8QI 2 "register_operand" "w")]
+                      UNSPEC_PMULL))]
+ "TARGET_SIMD"
+ "pmull\\t%0.8h, %1.8b, %2.8b"
+  [(set_attr "type" "neon_mul_b_long")]
+)
+
  ;; FP vector operations.
  ;; AArch64 AdvSIMD supports single-precision (32-bit) and
  ;; double-precision (64-bit) floating-point data types and arithmetic as
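
[Illustrative commentary, not part of the patch.]  The <su>mull2_lane
expanders above hand a vect_par_cnst_hi_half parallel to the
_internal insns, i.e. the "2"/"high" forms read the upper half of
operand 1.  A hypothetical scalar reference model (names made up) of
vmull_high_lane_s16, assuming the lane numbering the intrinsics see:

#include <stdint.h>

static void
ref_vmull_high_lane_s16 (int32_t out[4], const int16_t a[8],
                         const int16_t b[4], int lane)
{
  /* Widen the upper four elements of A and multiply each one by the
     selected element of B.  */
  for (int i = 0; i < 4; i++)
    out[i] = (int32_t) a[i + 4] * (int32_t) b[lane];
}
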
Index: gcc/config/aarch64/aarch64-simd-builtins.def
===================================================================
--- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 218464)
+++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
@@ -184,6 +184,39 @@ 
    BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
    BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
  
+  /* Implemented by vec_widen_<su>mult_hi_<mode>.  */
+  BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10)
+  BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10)
+  /* Implemented by aarch64_<su>mull<mode>.  */
+  BUILTIN_VD_BHSI (BINOPU, umull, 0)
+  BUILTIN_VD_BHSI (BINOP, smull, 0)
+  /* Implemented by aarch64_<su>mull_n<mode>.  */
+  BUILTIN_VD_HSI (BINOP, smull_n, 0)
+  BUILTIN_VD_HSI (BINOPU, umull_n, 0)
+  /* Implemented by aarch64_mul_n<mode>.  */
+  BUILTIN_VMUL (BINOP, mul_n, 0)
+  /* Implemented by aarch64_<su>mull2_n<mode>.  */
+  BUILTIN_VQ_HSI (BINOP, smull2_n, 0)
+  BUILTIN_VQ_HSI (BINOPU, umull2_n, 0)
+  /* Implemented by aarch64_<su>mull_lane<q><mode>.  */
+  BUILTIN_VD_HSI (TERNOP, smull_lane, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_lane, 0)
+  BUILTIN_VD_HSI (TERNOP, smull_laneq, 0)
+  BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0)
+  /* Implemented by aarch64_<su>mull2_lane<q><mode>.  */
+  BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0)
+  /* Implemented by aarch64_fmulx<mode>.  */
+  BUILTIN_VDQF (BINOP, fmulx, 0)
+  BUILTIN_GPF (BINOP, fmulx, 0)
+  BUILTIN_VDQF (BINOP, fmulx_lane, 0)
+
+  /* Implemented by aarch64_pmull<2><mode>.  */
+  VAR1 (BINOPP, pmull, 0, v8qi)
+  VAR1 (BINOPP, pmull2, 0, v16qi)
+
    BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
    /* Implemented by aarch64_<sur>shl<mode>.  */
    BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)
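
[Illustrative commentary, not part of the patch.]  Each BUILTIN_*
line instantiates one builtin per mode in its iterator; these are the
names the arm_neon.h hunk calls directly.  For example,
BUILTIN_VD_HSI (BINOP, smull_n, 0) yields __builtin_aarch64_smull_nv4hi
and __builtin_aarch64_smull_nv2si.  A hypothetical direct use
(function name made up), equivalent to vmull_n_s16:

#include <arm_neon.h>

int32x4_t
demo_smull_n (int16x4_t a, int16_t b)
{
  /* Same operation vmull_n_s16 performs after this patch:
     SMULL Vd.4S, Vn.4H, Vm.H[0].  */
  return __builtin_aarch64_smull_nv4hi (a, b);
}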