Message ID | 5486F084.6010202@huawei.com |
---|---|
State | New |
Headers | show |
On 9 December 2014 at 13:52, Jiangjiji <jiangjiji@huawei.com> wrote: > Hi, > This patch converts more intrinsics to use builtin functions instead of > the > previous inline assembly syntax. > Passed the glorious testsuite of Christophe Lyon. > > Three testcases are added for the testing of intriniscs which are not > covered by the testsuite: > gcc.target/aarch64/vmull_high.c > gcc.target/aarch64/vmull_high_lane.c > gcc.target/aarch64/vmull_high_n.c > As I said here: https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01934.html I am in tre process of converting my existing testsuite to GCC/Dejagnu. Please do not duplicate work. > Regtested with aarch64-linux-gnu on QEMU. > This patch has no regressions for aarch64_be-linux-gnu big-endian > target too. > OK for the trunk? > > > > Index: gcc/ChangeLog > =================================================================== > --- gcc/ChangeLog (revision 218464) > +++ gcc/ChangeLog (working copy) > @@ -1,3 +1,38 @@ > +2014-12-09 Felix Yang <felix.yang@huawei.com> > + Jiji Jiang <jiangjiji@huawei.com> > + > + * config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>, > + aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>, > + aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>, > + aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal, > + aarch64_<su>mull_laneq<mode>, > aarch64_<su>mull2_laneq<mode>_internal, > + aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>, > + aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>, > + aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>, > + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. > + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, > + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, > smull2_n, > + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, > + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, > + smull2_lane): New builtins. > + * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, > + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, > + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, > + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, > + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, > + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, > + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, > + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, > + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, > + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, > + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, > + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, > + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, > + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite > + using builtin functions. > + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, > + VDQF_Q): New unspec and int iterator. > + > 2014-12-07 Felix Yang <felix.yang@huawei.com> > Shanyao Chen <chenshanyao@huawei.com> > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > @@ -0,0 +1,111 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. */ > +VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad, > + 0xfe24, 0xfe9b, 0xff12, 0xff89 }; > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff9a0, 0xfffffa28, > + 0xfffffab0, 0xfffffb38 }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff7a2, > + 0xfffffffffffff83b }; > +VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae, > + 0xa758, 0xa802, 0xa8ac, 0xa956 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7, > + 0xbaf8b2, 0xbaf96d }; > +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbfffff4d8, > + 0xcbfffff5a4}; > +VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce, > + 0x6798, 0x6732, 0x66cc, 0x6666 }; > + > +#ifndef INSN_NAME > +#define INSN_NAME vmull_high > +#define TEST_MSG "VMUL_HIGH" > +#endif > + > +#define FNNAME1(NAME) exec_ ## NAME > +#define FNNAME(NAME) FNNAME1(NAME) > + > +void FNNAME (INSN_NAME) (void) > +{ > +#define DECL_VMUL(T, W, N) \ > + DECL_VARIABLE(vector1, T, W, N); \ > + DECL_VARIABLE(vector2, T, W, N); > + > + /* vector_res = OP(vector1, vector2), then store the result. */ > +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) \ > + VECT_VAR(vector_res, T1, W1, N1) = \ > + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ > + VECT_VAR(vector2, T1, W, N)); \ > + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ > + VECT_VAR(vector_res, T1, W1, N1)) > + > +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1) \ > + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) > + > +#define CHECK_VMULL_HIGH_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 16, 8, PRIx16, expected, comment); \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 16, 8, PRIx16, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, poly, 16, 8, PRIx16, expected, comment); \ > + } > + > + DECL_VMUL(int, 8, 16); > + DECL_VMUL(int, 16, 8); > + DECL_VMUL(int, 32, 4); > + DECL_VMUL(uint, 8, 16); > + DECL_VMUL(uint, 16, 8); > + DECL_VMUL(uint, 32, 4); > + DECL_VMUL(poly, 8, 16); > + > + DECL_VARIABLE(vector_res, int, 16, 8); > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 16, 8); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + DECL_VARIABLE(vector_res, poly, 16, 8); > + > + clean_results (); > + > + /* Initialize input "vector1" from "buffer". */ > + VLOAD(vector1, buffer, q, int, s, 8, 16); > + VLOAD(vector1, buffer, q, int, s, 16, 8); > + VLOAD(vector1, buffer, q, int, s, 32, 4); > + VLOAD(vector1, buffer, q, uint, u, 8, 16); > + VLOAD(vector1, buffer, q, uint, u, 16, 8); > + VLOAD(vector1, buffer, q, uint, u, 32, 4); > + VLOAD(vector1, buffer, q, poly, p, 8, 16); > + > + /* Choose init value arbitrarily. */ > + VDUP(vector2, q, int, s, 8, 16, 0x77); > + VDUP(vector2, q, int, s, 16, 8, 0x88); > + VDUP(vector2, q, int, s, 32, 4, 0x99); > + VDUP(vector2, q, uint, u, 8, 16, 0xAA); > + VDUP(vector2, q, uint, u, 16, 8, 0xBB); > + VDUP(vector2, q, uint, u, 32, 4, 0xCC); > + VDUP(vector2, q, poly, p, 8, 16, 0xAA); > + > + /* Execute the tests. */ > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 8, 16, 16, 8); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 8, 16, 16, 8); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2); > + TEST_VMULL_HIGH(INSN_NAME, , poly, p, 8, 16, 16, 8); > + > + CHECK_VMULL_HIGH_RESULTS (TEST_MSG, ""); > +} > + > +int main (void) > +{ > + FNNAME (INSN_NAME) (); > + > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > (revision 0) > @@ -0,0 +1,135 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > +VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 > }; > +VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000}; > +VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 > }; > +VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 }; > + > +#define TEST_MSG "VMULL_HIGH_LANE/VMULL_HIGH_LANEQ" > +void exec_vmull_high_lane (void) > +{ > + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. > */ > +#define TEST_VMULL_HIGH_LANE(T1, T2, W, W2, N1, N2, L) \ > + VECT_VAR(vector_res, T1, W2, N2) = \ > + vmull##_high_lane_##T2##W(VECT_VAR(vector, T1, W, N1 ), \ > + VECT_VAR(vector2, T1, W, N2), \ > + L); \ > + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, > N2)) > + > +#define CHECK_VMULL_HIGH_LANE_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + > + /* With ARM RVCT, we need to declare variables before any executable > + statement */ > + DECL_VARIABLE(vector, int, 16, 8); > + DECL_VARIABLE(vector, int, 32, 4); > + DECL_VARIABLE(vector, uint, 16, 8); > + DECL_VARIABLE(vector, uint, 32, 4); > + DECL_VARIABLE(vector2, int, 16, 4); > + DECL_VARIABLE(vector2, int, 32, 2); > + DECL_VARIABLE(vector2, uint, 16, 4); > + DECL_VARIABLE(vector2, uint, 32, 2); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize vector */ > + VDUP(vector2, , int, s, 16, 4, 0x1000); > + VDUP(vector2, , int, s, 32, 2, 0x1000); > + VDUP(vector2, , uint, u, 16, 4, 0x1000); > + VDUP(vector2, , uint, u, 32, 2, 0x1000); > + > + /* Initialize vector2 */ > + VDUP(vector, q, int, s, 16, 8, 0x4); > + VDUP(vector, q, int, s, 32, 4, 0x2); > + VDUP(vector, q, uint, u, 16, 8, 0x4); > + VDUP(vector, q, uint, u, 32, 4, 0x2); > + > + /* Choose lane arbitrarily */ > + TEST_VMULL_HIGH_LANE(int, s, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANE(int, s, 32, 64, 4, 2, 1); > + TEST_VMULL_HIGH_LANE(uint, u, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANE(uint, u, 32, 64, 4, 2, 1); > + > + CHECK_VMULL_HIGH_LANE_RESULTS (TEST_MSG, ""); > +} > + > + > +void exec_vmull_high_laneq (void) > +{ > + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. > */ > +#define TEST_VMULL_HIGH_LANEQ(T1, T2, W, W2, N2, N1, L) \ > + VECT_VAR(vector_res, T1, W2, N1) = \ > + vmull##_high_laneq_##T2##W(VECT_VAR(vector, T1, W, N2 ), \ > + VECT_VAR(vector2, T1, W, N2), \ > + L); \ > + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N1), VECT_VAR(vector_res, T1, W2, > N1)) > + > +#define CHECK_VMULL_HIGH_LANEQ_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + > + /* With ARM RVCT, we need to declare variables before any executable > + statement */ > + DECL_VARIABLE(vector, int, 16, 8); > + DECL_VARIABLE(vector, int, 32, 4); > + DECL_VARIABLE(vector, uint, 16, 8); > + DECL_VARIABLE(vector, uint, 32, 4); > + DECL_VARIABLE(vector2, int, 16, 8); > + DECL_VARIABLE(vector2, int, 32, 4); > + DECL_VARIABLE(vector2, uint, 16, 8); > + DECL_VARIABLE(vector2, uint, 32, 4); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize vector */ > + VDUP(vector2, q, int, s, 16, 8, 0x1000); > + VDUP(vector2, q, int, s, 32, 4, 0x1000); > + VDUP(vector2, q, uint, u, 16, 8, 0x1000); > + VDUP(vector2, q, uint, u, 32, 4, 0x1000); > + > + /* Initialize vector2 */ > + VDUP(vector, q, int, s, 16, 8, 0x4); > + VDUP(vector, q, int, s, 32, 4, 0x2); > + VDUP(vector, q, uint, u, 16, 8, 0x4); > + VDUP(vector, q, uint, u, 32, 4, 0x2); > + > + /* Choose lane arbitrarily */ > + TEST_VMULL_HIGH_LANEQ(int, s, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANEQ(int, s, 32, 64, 4, 2, 1); > + TEST_VMULL_HIGH_LANEQ(uint, u, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANEQ(uint, u, 32, 64, 4, 2, 1); > + > + CHECK_VMULL_HIGH_LANEQ_RESULTS (TEST_MSG, ""); > +} > + > + > + > + > +int main (void) > +{ > + exec_vmull_high_lane(); > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > (revision 0) > @@ -0,0 +1,81 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. */ > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff73c, 0xfffff7f7, > + 0xfffff8b2, 0xfffff96d }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff4d8, > + 0xfffffffffffff5a4 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xedf4d8, 0xedf5c6, > + 0xedf6b4, 0xedf7a2 }; > +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfefffff20e, > + 0xfefffff30d}; > + > +#ifndef INSN_NAME > +#define INSN_NAME vmull_high_n > +#define TEST_MSG "VMULL_HIGH_N" > +#endif > + > +#define FNNAME1(NAME) exec_ ## NAME > +#define FNNAME(NAME) FNNAME1(NAME) > + > +void FNNAME (INSN_NAME) (void) > +{ > +#define DECL_VMUL(T, W, N) \ > + DECL_VARIABLE(vector1, T, W, N); \ > + > + /* vector_res = OP(vector1, vector2), then store the result. */ > +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) \ > + VECT_VAR(vector_res, T1, W1, N1) = \ > + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ > + C); \ > + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ > + VECT_VAR(vector_res, T1, W1, N1)) > + > +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1, C) \ > + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) > + > +#define CHECK_VMULL_HIGH_N_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + DECL_VMUL(int, 16, 8); > + DECL_VMUL(int, 32, 4); > + DECL_VMUL(uint, 16, 8); > + DECL_VMUL(uint, 32, 4); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize input "vector1" from "buffer". */ > + VLOAD(vector1, buffer, q, int, s, 16, 8); > + VLOAD(vector1, buffer, q, int, s, 32, 4); > + VLOAD(vector1, buffer, q, uint, u, 16, 8); > + VLOAD(vector1, buffer, q, uint, u, 32, 4); > + > + > + /* Execute the tests. */ > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4, 0xBB); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2, 0xCC); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4, 0xEE); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2, 0xFF); > + > + CHECK_VMULL_HIGH_N_RESULTS (TEST_MSG, ""); > +} > + > +int main (void) > +{ > + FNNAME (INSN_NAME) (); > + > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/ChangeLog > =================================================================== > --- gcc/testsuite/ChangeLog (revision 218464) > +++ gcc/testsuite/ChangeLog (working copy) > @@ -1,3 +1,13 @@ > +2014-12-09 Felix Yang <felix.yang@huawei.com> > + Jiji Jiang <jiangjiji@huawei.com> > + > + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c: New > + test. > + * > testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c: > + New test. > + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c: > New > + test. > + > 2014-12-07 Christophe Lyon <christophe.lyon@linaro.org> > * gcc.target/aarch64/advsimd-intrinsics/vaddhn.c: Actually execute > Index: gcc/config/aarch64/arm_neon.h > =================================================================== > --- gcc/config/aarch64/arm_neon.h (revision 218464) > +++ gcc/config/aarch64/arm_neon.h (working copy) > @@ -7627,671 +7627,6 @@ vmovn_u64 (uint64x2_t a) > return result; > } > -__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > -vmul_n_f32 (float32x2_t a, float32_t b) > -{ > - float32x2_t result; > - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) > -vmul_n_s16 (int16x4_t a, int16_t b) > -{ > - int16x4_t result; > - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) > -vmul_n_s32 (int32x2_t a, int32_t b) > -{ > - int32x2_t result; > - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x4_t __attribute__ > ((__always_inline__)) > -vmul_n_u16 (uint16x4_t a, uint16_t b) > -{ > - uint16x4_t result; > - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x2_t __attribute__ > ((__always_inline__)) > -vmul_n_u32 (uint32x2_t a, uint32_t b) > -{ > - uint32x2_t result; > - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmull_high_lane_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x4_t b_ = (b); \ > - int16x8_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x2_t b_ = (b); \ > - int32x4_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x4_t b_ = (b); \ > - uint16x8_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x2_t b_ = (b); \ > - uint32x4_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x8_t b_ = (b); \ > - int16x8_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x4_t b_ = (b); \ > - int32x4_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x8_t b_ = (b); \ > - uint16x8_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x4_t b_ = (b); \ > - uint32x4_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_high_n_s16 (int16x8_t a, int16_t b) > -{ > - int32x4_t result; > - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_high_n_s32 (int32x4_t a, int32_t b) > -{ > - int64x2_t result; > - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_high_n_u16 (uint16x8_t a, uint16_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_high_n_u32 (uint32x4_t a, uint32_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > -vmull_high_p8 (poly8x16_t a, poly8x16_t b) > -{ > - poly16x8_t result; > - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmull_high_s8 (int8x16_t a, int8x16_t b) > -{ > - int16x8_t result; > - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_high_s16 (int16x8_t a, int16x8_t b) > -{ > - int32x4_t result; > - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_high_s32 (int32x4_t a, int32x4_t b) > -{ > - int64x2_t result; > - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmull_high_u8 (uint8x16_t a, uint8x16_t b) > -{ > - uint16x8_t result; > - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_high_u16 (uint16x8_t a, uint16x8_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_high_u32 (uint32x4_t a, uint32x4_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmull_lane_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x4_t b_ = (b); \ > - int16x4_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x2_t b_ = (b); \ > - int32x2_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x4_t b_ = (b); \ > - uint16x4_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x2_t b_ = (b); \ > - uint32x2_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x8_t b_ = (b); \ > - int16x4_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x4_t b_ = (b); \ > - int32x2_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x8_t b_ = (b); \ > - uint16x4_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x4_t b_ = (b); \ > - uint32x2_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_n_s16 (int16x4_t a, int16_t b) > -{ > - int32x4_t result; > - __asm__ ("smull %0.4s,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_n_s32 (int32x2_t a, int32_t b) > -{ > - int64x2_t result; > - __asm__ ("smull %0.2d,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_n_u16 (uint16x4_t a, uint16_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull %0.4s,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_n_u32 (uint32x2_t a, uint32_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull %0.2d,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > -vmull_p8 (poly8x8_t a, poly8x8_t b) > -{ > - poly16x8_t result; > - __asm__ ("pmull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmull_s8 (int8x8_t a, int8x8_t b) > -{ > - int16x8_t result; > - __asm__ ("smull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_s16 (int16x4_t a, int16x4_t b) > -{ > - int32x4_t result; > - __asm__ ("smull %0.4s, %1.4h, %2.4h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_s32 (int32x2_t a, int32x2_t b) > -{ > - int64x2_t result; > - __asm__ ("smull %0.2d, %1.2s, %2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmull_u8 (uint8x8_t a, uint8x8_t b) > -{ > - uint16x8_t result; > - __asm__ ("umull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_u16 (uint16x4_t a, uint16x4_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull %0.4s, %1.4h, %2.4h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_u32 (uint32x2_t a, uint32x2_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull %0.2d, %1.2s, %2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > -vmulq_n_f32 (float32x4_t a, float32_t b) > -{ > - float32x4_t result; > - __asm__ ("fmul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > -vmulq_n_f64 (float64x2_t a, float64_t b) > -{ > - float64x2_t result; > - __asm__ ("fmul %0.2d,%1.2d,%2.d[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmulq_n_s16 (int16x8_t a, int16_t b) > -{ > - int16x8_t result; > - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmulq_n_s32 (int32x4_t a, int32_t b) > -{ > - int32x4_t result; > - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmulq_n_u16 (uint16x8_t a, uint16_t b) > -{ > - uint16x8_t result; > - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmulq_n_u32 (uint32x4_t a, uint32_t b) > -{ > - uint32x4_t result; > - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > -vmulx_f32 (float32x2_t a, float32x2_t b) > -{ > - float32x2_t result; > - __asm__ ("fmulx %0.2s,%1.2s,%2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmulx_lane_f32(a, b, c) \ > - __extension__ \ > - ({ \ > - float32x4_t b_ = (b); \ > - float32x2_t a_ = (a); \ > - float32x2_t result; \ > - __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) > -vmulxd_f64 (float64_t a, float64_t b) > -{ > - float64_t result; > - __asm__ ("fmulx %d0, %d1, %d2" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > -vmulxq_f32 (float32x4_t a, float32x4_t b) > -{ > - float32x4_t result; > - __asm__ ("fmulx %0.4s,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > -vmulxq_f64 (float64x2_t a, float64x2_t b) > -{ > - float64x2_t result; > - __asm__ ("fmulx %0.2d,%1.2d,%2.2d" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmulxq_lane_f32(a, b, c) \ > - __extension__ \ > - ({ \ > - float32x4_t b_ = (b); \ > - float32x4_t a_ = (a); \ > - float32x4_t result; \ > - __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmulxq_lane_f64(a, b, c) \ > - __extension__ \ > - ({ \ > - float64x2_t b_ = (b); \ > - float64x2_t a_ = (a); \ > - float64x2_t result; \ > - __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) > -vmulxs_f32 (float32_t a, float32_t b) > -{ > - float32_t result; > - __asm__ ("fmulx %s0, %s1, %s2" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) > vmvn_p8 (poly8x8_t a) > { > @@ -19172,6 +18507,78 @@ vmul_n_f64 (float64x1_t __a, float64_t __b) > return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; > } > +__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > +vmul_n_f32 (float32x2_t __a, float32_t __b) > +{ > + return __builtin_aarch64_mul_nv2sf (__a, __b); > +} > + > +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) > +vmul_n_s16 (int16x4_t __a, int16_t __b) > +{ > + return __builtin_aarch64_mul_nv4hi (__a, __b); > +} > + > +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) > +vmul_n_s32 (int32x2_t __a, int32_t __b) > +{ > + return __builtin_aarch64_mul_nv2si (__a, __b); > +} > + > +__extension__ static __inline uint16x4_t __attribute__ > ((__always_inline__)) > +vmul_n_u16 (uint16x4_t __a, uint16_t __b) > +{ > + return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a, > + (int16_t)__b); > +} > + > +__extension__ static __inline uint32x2_t __attribute__ > ((__always_inline__)) > +vmul_n_u32 (uint32x2_t __a, uint32_t __b) > +{ > + return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a, > + (int32_t)__b); > +} > + > +/* vmulq_n */ > + > +__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > +vmulq_n_f32 (float32x4_t __a, float32_t __b) > +{ > + return __builtin_aarch64_mul_nv4sf (__a, __b); > +} > + > +__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > +vmulq_n_f64 (float64x2_t __a, float64_t __b) > +{ > + return __builtin_aarch64_mul_nv2df (__a, __b); > +} > + > +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > +vmulq_n_s16 (int16x8_t __a, int16_t __b) > +{ > + return __builtin_aarch64_mul_nv8hi (__a, __b); > +} > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmulq_n_s32 (int32x4_t __a, int32_t __b) > +{ > + return __builtin_aarch64_mul_nv4si (__a, __b); > +} > + > +__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) > +{ > + return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a, > + (int16_t)__b); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmulq_n_u32 (uint32x4_t __a, uint32_t __b) > +{ > + return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a, > + (int32_t)__b); > +} > + > /* vmulq_lane */ > __extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > @@ -19249,6 +18656,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c > return __a * __aarch64_vgetq_lane_u32 (__b, __lane); > } > +/* vmull_high_lane */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_lanev4si (__a, __b, __c); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c) > +{ > + return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a, > + (int16x4_t) __b, > + __c); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c) > +{ > + return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a, > + (int32x2_t) __b, > + __c); > +} > + > +/* vmull_high_laneq */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) > +{ > + return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a, > + (int16x8_t)__b, > + __c); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) > +{ > + return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a, > + (int32x4_t) __b, > + __c); > +} > + > +/* vmull_high_n */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_n_s16 (int16x8_t __a, int16_t __b) > +{ > + return __builtin_aarch64_smull2_nv8hi (__a, __b); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_n_s32 (int32x4_t __a, int32_t __b) > +{ > + return __builtin_aarch64_smull2_nv4si (__a, __b); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) > +{ > + return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) > +{ > + return __builtin_aarch64_umull2_nv4si_uuu (__a, __b); > +} > + > +/* vmull_high */ > + > +__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) > +{ > + return __builtin_aarch64_pmull2v16qi_ppp (__a, __b); > +} > + > +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > +vmull_high_s8 (int8x16_t __a, int8x16_t __b) > +{ > + return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b); > +} > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_s16 (int16x8_t __a, int16x8_t __b) > +{ > + return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
Hi, Christophe Lyon These testcases are not covered by the glorious testsuite. If these cases are in your todo list , I will exclude them. Thanks. -----邮件原件----- 发件人: Christophe Lyon [mailto:christophe.lyon@linaro.org] 发送时间: 2014年12月9日 21:43 收件人: Jiangjiji 抄送: gcc-patches@gcc.gnu.org; Richard Earnshaw; Yangfei (Felix); Marcus Shawcroft 主题: Re: [AArch64, NEON] Improve vmulX intrinsics On 9 December 2014 at 13:52, Jiangjiji <jiangjiji@huawei.com> wrote: > Hi, > This patch converts more intrinsics to use builtin functions instead of > the > previous inline assembly syntax. > Passed the glorious testsuite of Christophe Lyon. > > Three testcases are added for the testing of intriniscs which are not > covered by the testsuite: > gcc.target/aarch64/vmull_high.c > gcc.target/aarch64/vmull_high_lane.c > gcc.target/aarch64/vmull_high_n.c > As I said here: https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01934.html I am in tre process of converting my existing testsuite to GCC/Dejagnu. Please do not duplicate work. > Regtested with aarch64-linux-gnu on QEMU. > This patch has no regressions for aarch64_be-linux-gnu big-endian > target too. > OK for the trunk? > > > > Index: gcc/ChangeLog > =================================================================== > --- gcc/ChangeLog (revision 218464) > +++ gcc/ChangeLog (working copy) > @@ -1,3 +1,38 @@ > +2014-12-09 Felix Yang <felix.yang@huawei.com> > + Jiji Jiang <jiangjiji@huawei.com> > + > + * config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>, > + aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>, > + aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>, > + aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal, > + aarch64_<su>mull_laneq<mode>, > aarch64_<su>mull2_laneq<mode>_internal, > + aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>, > + aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>, > + aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>, > + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. > + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, > + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, > smull2_n, > + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, > + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, > + smull2_lane): New builtins. > + * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, > + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, > + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, > + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, > + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, > + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, > + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, > + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, > + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, > + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, > + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, > + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, > + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, > + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite > + using builtin functions. > + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, > + VDQF_Q): New unspec and int iterator. > + > 2014-12-07 Felix Yang <felix.yang@huawei.com> > Shanyao Chen <chenshanyao@huawei.com> > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > @@ -0,0 +1,111 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. */ > +VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad, > + 0xfe24, 0xfe9b, 0xff12, 0xff89 }; > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff9a0, 0xfffffa28, > + 0xfffffab0, 0xfffffb38 }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff7a2, > + 0xfffffffffffff83b }; > +VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae, > + 0xa758, 0xa802, 0xa8ac, 0xa956 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7, > + 0xbaf8b2, 0xbaf96d }; > +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbfffff4d8, > + 0xcbfffff5a4}; > +VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce, > + 0x6798, 0x6732, 0x66cc, 0x6666 }; > + > +#ifndef INSN_NAME > +#define INSN_NAME vmull_high > +#define TEST_MSG "VMUL_HIGH" > +#endif > + > +#define FNNAME1(NAME) exec_ ## NAME > +#define FNNAME(NAME) FNNAME1(NAME) > + > +void FNNAME (INSN_NAME) (void) > +{ > +#define DECL_VMUL(T, W, N) \ > + DECL_VARIABLE(vector1, T, W, N); \ > + DECL_VARIABLE(vector2, T, W, N); > + > + /* vector_res = OP(vector1, vector2), then store the result. */ > +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) \ > + VECT_VAR(vector_res, T1, W1, N1) = \ > + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ > + VECT_VAR(vector2, T1, W, N)); \ > + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ > + VECT_VAR(vector_res, T1, W1, N1)) > + > +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1) \ > + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) > + > +#define CHECK_VMULL_HIGH_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 16, 8, PRIx16, expected, comment); \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 16, 8, PRIx16, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, poly, 16, 8, PRIx16, expected, comment); \ > + } > + > + DECL_VMUL(int, 8, 16); > + DECL_VMUL(int, 16, 8); > + DECL_VMUL(int, 32, 4); > + DECL_VMUL(uint, 8, 16); > + DECL_VMUL(uint, 16, 8); > + DECL_VMUL(uint, 32, 4); > + DECL_VMUL(poly, 8, 16); > + > + DECL_VARIABLE(vector_res, int, 16, 8); > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 16, 8); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + DECL_VARIABLE(vector_res, poly, 16, 8); > + > + clean_results (); > + > + /* Initialize input "vector1" from "buffer". */ > + VLOAD(vector1, buffer, q, int, s, 8, 16); > + VLOAD(vector1, buffer, q, int, s, 16, 8); > + VLOAD(vector1, buffer, q, int, s, 32, 4); > + VLOAD(vector1, buffer, q, uint, u, 8, 16); > + VLOAD(vector1, buffer, q, uint, u, 16, 8); > + VLOAD(vector1, buffer, q, uint, u, 32, 4); > + VLOAD(vector1, buffer, q, poly, p, 8, 16); > + > + /* Choose init value arbitrarily. */ > + VDUP(vector2, q, int, s, 8, 16, 0x77); > + VDUP(vector2, q, int, s, 16, 8, 0x88); > + VDUP(vector2, q, int, s, 32, 4, 0x99); > + VDUP(vector2, q, uint, u, 8, 16, 0xAA); > + VDUP(vector2, q, uint, u, 16, 8, 0xBB); > + VDUP(vector2, q, uint, u, 32, 4, 0xCC); > + VDUP(vector2, q, poly, p, 8, 16, 0xAA); > + > + /* Execute the tests. */ > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 8, 16, 16, 8); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 8, 16, 16, 8); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2); > + TEST_VMULL_HIGH(INSN_NAME, , poly, p, 8, 16, 16, 8); > + > + CHECK_VMULL_HIGH_RESULTS (TEST_MSG, ""); > +} > + > +int main (void) > +{ > + FNNAME (INSN_NAME) (); > + > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > (revision 0) > @@ -0,0 +1,135 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > +VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 > }; > +VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000}; > +VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 > }; > +VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 }; > + > +#define TEST_MSG "VMULL_HIGH_LANE/VMULL_HIGH_LANEQ" > +void exec_vmull_high_lane (void) > +{ > + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. > */ > +#define TEST_VMULL_HIGH_LANE(T1, T2, W, W2, N1, N2, L) \ > + VECT_VAR(vector_res, T1, W2, N2) = \ > + vmull##_high_lane_##T2##W(VECT_VAR(vector, T1, W, N1 ), \ > + VECT_VAR(vector2, T1, W, N2), \ > + L); \ > + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, > N2)) > + > +#define CHECK_VMULL_HIGH_LANE_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + > + /* With ARM RVCT, we need to declare variables before any executable > + statement */ > + DECL_VARIABLE(vector, int, 16, 8); > + DECL_VARIABLE(vector, int, 32, 4); > + DECL_VARIABLE(vector, uint, 16, 8); > + DECL_VARIABLE(vector, uint, 32, 4); > + DECL_VARIABLE(vector2, int, 16, 4); > + DECL_VARIABLE(vector2, int, 32, 2); > + DECL_VARIABLE(vector2, uint, 16, 4); > + DECL_VARIABLE(vector2, uint, 32, 2); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize vector */ > + VDUP(vector2, , int, s, 16, 4, 0x1000); > + VDUP(vector2, , int, s, 32, 2, 0x1000); > + VDUP(vector2, , uint, u, 16, 4, 0x1000); > + VDUP(vector2, , uint, u, 32, 2, 0x1000); > + > + /* Initialize vector2 */ > + VDUP(vector, q, int, s, 16, 8, 0x4); > + VDUP(vector, q, int, s, 32, 4, 0x2); > + VDUP(vector, q, uint, u, 16, 8, 0x4); > + VDUP(vector, q, uint, u, 32, 4, 0x2); > + > + /* Choose lane arbitrarily */ > + TEST_VMULL_HIGH_LANE(int, s, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANE(int, s, 32, 64, 4, 2, 1); > + TEST_VMULL_HIGH_LANE(uint, u, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANE(uint, u, 32, 64, 4, 2, 1); > + > + CHECK_VMULL_HIGH_LANE_RESULTS (TEST_MSG, ""); > +} > + > + > +void exec_vmull_high_laneq (void) > +{ > + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. > */ > +#define TEST_VMULL_HIGH_LANEQ(T1, T2, W, W2, N2, N1, L) \ > + VECT_VAR(vector_res, T1, W2, N1) = \ > + vmull##_high_laneq_##T2##W(VECT_VAR(vector, T1, W, N2 ), \ > + VECT_VAR(vector2, T1, W, N2), \ > + L); \ > + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N1), VECT_VAR(vector_res, T1, W2, > N1)) > + > +#define CHECK_VMULL_HIGH_LANEQ_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + > + /* With ARM RVCT, we need to declare variables before any executable > + statement */ > + DECL_VARIABLE(vector, int, 16, 8); > + DECL_VARIABLE(vector, int, 32, 4); > + DECL_VARIABLE(vector, uint, 16, 8); > + DECL_VARIABLE(vector, uint, 32, 4); > + DECL_VARIABLE(vector2, int, 16, 8); > + DECL_VARIABLE(vector2, int, 32, 4); > + DECL_VARIABLE(vector2, uint, 16, 8); > + DECL_VARIABLE(vector2, uint, 32, 4); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize vector */ > + VDUP(vector2, q, int, s, 16, 8, 0x1000); > + VDUP(vector2, q, int, s, 32, 4, 0x1000); > + VDUP(vector2, q, uint, u, 16, 8, 0x1000); > + VDUP(vector2, q, uint, u, 32, 4, 0x1000); > + > + /* Initialize vector2 */ > + VDUP(vector, q, int, s, 16, 8, 0x4); > + VDUP(vector, q, int, s, 32, 4, 0x2); > + VDUP(vector, q, uint, u, 16, 8, 0x4); > + VDUP(vector, q, uint, u, 32, 4, 0x2); > + > + /* Choose lane arbitrarily */ > + TEST_VMULL_HIGH_LANEQ(int, s, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANEQ(int, s, 32, 64, 4, 2, 1); > + TEST_VMULL_HIGH_LANEQ(uint, u, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANEQ(uint, u, 32, 64, 4, 2, 1); > + > + CHECK_VMULL_HIGH_LANEQ_RESULTS (TEST_MSG, ""); > +} > + > + > + > + > +int main (void) > +{ > + exec_vmull_high_lane(); > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > (revision 0) > @@ -0,0 +1,81 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. */ > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff73c, 0xfffff7f7, > + 0xfffff8b2, 0xfffff96d }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff4d8, > + 0xfffffffffffff5a4 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xedf4d8, 0xedf5c6, > + 0xedf6b4, 0xedf7a2 }; > +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfefffff20e, > + 0xfefffff30d}; > + > +#ifndef INSN_NAME > +#define INSN_NAME vmull_high_n > +#define TEST_MSG "VMULL_HIGH_N" > +#endif > + > +#define FNNAME1(NAME) exec_ ## NAME > +#define FNNAME(NAME) FNNAME1(NAME) > + > +void FNNAME (INSN_NAME) (void) > +{ > +#define DECL_VMUL(T, W, N) \ > + DECL_VARIABLE(vector1, T, W, N); \ > + > + /* vector_res = OP(vector1, vector2), then store the result. */ > +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) \ > + VECT_VAR(vector_res, T1, W1, N1) = \ > + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ > + C); \ > + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ > + VECT_VAR(vector_res, T1, W1, N1)) > + > +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1, C) \ > + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) > + > +#define CHECK_VMULL_HIGH_N_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + DECL_VMUL(int, 16, 8); > + DECL_VMUL(int, 32, 4); > + DECL_VMUL(uint, 16, 8); > + DECL_VMUL(uint, 32, 4); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize input "vector1" from "buffer". */ > + VLOAD(vector1, buffer, q, int, s, 16, 8); > + VLOAD(vector1, buffer, q, int, s, 32, 4); > + VLOAD(vector1, buffer, q, uint, u, 16, 8); > + VLOAD(vector1, buffer, q, uint, u, 32, 4); > + > + > + /* Execute the tests. */ > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4, 0xBB); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2, 0xCC); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4, 0xEE); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2, 0xFF); > + > + CHECK_VMULL_HIGH_N_RESULTS (TEST_MSG, ""); > +} > + > +int main (void) > +{ > + FNNAME (INSN_NAME) (); > + > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/ChangeLog > =================================================================== > --- gcc/testsuite/ChangeLog (revision 218464) > +++ gcc/testsuite/ChangeLog (working copy) > @@ -1,3 +1,13 @@ > +2014-12-09 Felix Yang <felix.yang@huawei.com> > + Jiji Jiang <jiangjiji@huawei.com> > + > + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c: New > + test. > + * > testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c: > + New test. > + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c: > New > + test. > + > 2014-12-07 Christophe Lyon <christophe.lyon@linaro.org> > * gcc.target/aarch64/advsimd-intrinsics/vaddhn.c: Actually execute > Index: gcc/config/aarch64/arm_neon.h > =================================================================== > --- gcc/config/aarch64/arm_neon.h (revision 218464) > +++ gcc/config/aarch64/arm_neon.h (working copy) > @@ -7627,671 +7627,6 @@ vmovn_u64 (uint64x2_t a) > return result; > } > -__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > -vmul_n_f32 (float32x2_t a, float32_t b) > -{ > - float32x2_t result; > - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) > -vmul_n_s16 (int16x4_t a, int16_t b) > -{ > - int16x4_t result; > - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) > -vmul_n_s32 (int32x2_t a, int32_t b) > -{ > - int32x2_t result; > - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x4_t __attribute__ > ((__always_inline__)) > -vmul_n_u16 (uint16x4_t a, uint16_t b) > -{ > - uint16x4_t result; > - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x2_t __attribute__ > ((__always_inline__)) > -vmul_n_u32 (uint32x2_t a, uint32_t b) > -{ > - uint32x2_t result; > - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmull_high_lane_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x4_t b_ = (b); \ > - int16x8_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x2_t b_ = (b); \ > - int32x4_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x4_t b_ = (b); \ > - uint16x8_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x2_t b_ = (b); \ > - uint32x4_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x8_t b_ = (b); \ > - int16x8_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x4_t b_ = (b); \ > - int32x4_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x8_t b_ = (b); \ > - uint16x8_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x4_t b_ = (b); \ > - uint32x4_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_high_n_s16 (int16x8_t a, int16_t b) > -{ > - int32x4_t result; > - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_high_n_s32 (int32x4_t a, int32_t b) > -{ > - int64x2_t result; > - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_high_n_u16 (uint16x8_t a, uint16_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_high_n_u32 (uint32x4_t a, uint32_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > -vmull_high_p8 (poly8x16_t a, poly8x16_t b) > -{ > - poly16x8_t result; > - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmull_high_s8 (int8x16_t a, int8x16_t b) > -{ > - int16x8_t result; > - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_high_s16 (int16x8_t a, int16x8_t b) > -{ > - int32x4_t result; > - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_high_s32 (int32x4_t a, int32x4_t b) > -{ > - int64x2_t result; > - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmull_high_u8 (uint8x16_t a, uint8x16_t b) > -{ > - uint16x8_t result; > - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_high_u16 (uint16x8_t a, uint16x8_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_high_u32 (uint32x4_t a, uint32x4_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmull_lane_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x4_t b_ = (b); \ > - int16x4_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x2_t b_ = (b); \ > - int32x2_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x4_t b_ = (b); \ > - uint16x4_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x2_t b_ = (b); \ > - uint32x2_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x8_t b_ = (b); \ > - int16x4_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x4_t b_ = (b); \ > - int32x2_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x8_t b_ = (b); \ > - uint16x4_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x4_t b_ = (b); \ > - uint32x2_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_n_s16 (int16x4_t a, int16_t b) > -{ > - int32x4_t result; > - __asm__ ("smull %0.4s,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_n_s32 (int32x2_t a, int32_t b) > -{ > - int64x2_t result; > - __asm__ ("smull %0.2d,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_n_u16 (uint16x4_t a, uint16_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull %0.4s,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_n_u32 (uint32x2_t a, uint32_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull %0.2d,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > -vmull_p8 (poly8x8_t a, poly8x8_t b) > -{ > - poly16x8_t result; > - __asm__ ("pmull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmull_s8 (int8x8_t a, int8x8_t b) > -{ > - int16x8_t result; > - __asm__ ("smull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_s16 (int16x4_t a, int16x4_t b) > -{ > - int32x4_t result; > - __asm__ ("smull %0.4s, %1.4h, %2.4h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_s32 (int32x2_t a, int32x2_t b) > -{ > - int64x2_t result; > - __asm__ ("smull %0.2d, %1.2s, %2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmull_u8 (uint8x8_t a, uint8x8_t b) > -{ > - uint16x8_t result; > - __asm__ ("umull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_u16 (uint16x4_t a, uint16x4_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull %0.4s, %1.4h, %2.4h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_u32 (uint32x2_t a, uint32x2_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull %0.2d, %1.2s, %2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > -vmulq_n_f32 (float32x4_t a, float32_t b) > -{ > - float32x4_t result; > - __asm__ ("fmul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > -vmulq_n_f64 (float64x2_t a, float64_t b) > -{ > - float64x2_t result; > - __asm__ ("fmul %0.2d,%1.2d,%2.d[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmulq_n_s16 (int16x8_t a, int16_t b) > -{ > - int16x8_t result; > - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmulq_n_s32 (int32x4_t a, int32_t b) > -{ > - int32x4_t result; > - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmulq_n_u16 (uint16x8_t a, uint16_t b) > -{ > - uint16x8_t result; > - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmulq_n_u32 (uint32x4_t a, uint32_t b) > -{ > - uint32x4_t result; > - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > -vmulx_f32 (float32x2_t a, float32x2_t b) > -{ > - float32x2_t result; > - __asm__ ("fmulx %0.2s,%1.2s,%2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmulx_lane_f32(a, b, c) \ > - __extension__ \ > - ({ \ > - float32x4_t b_ = (b); \ > - float32x2_t a_ = (a); \ > - float32x2_t result; \ > - __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) > -vmulxd_f64 (float64_t a, float64_t b) > -{ > - float64_t result; > - __asm__ ("fmulx %d0, %d1, %d2" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > -vmulxq_f32 (float32x4_t a, float32x4_t b) > -{ > - float32x4_t result; > - __asm__ ("fmulx %0.4s,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > -vmulxq_f64 (float64x2_t a, float64x2_t b) > -{ > - float64x2_t result; > - __asm__ ("fmulx %0.2d,%1.2d,%2.2d" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmulxq_lane_f32(a, b, c) \ > - __extension__ \ > - ({ \ > - float32x4_t b_ = (b); \ > - float32x4_t a_ = (a); \ > - float32x4_t result; \ > - __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmulxq_lane_f64(a, b, c) \ > - __extension__ \ > - ({ \ > - float64x2_t b_ = (b); \ > - float64x2_t a_ = (a); \ > - float64x2_t result; \ > - __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) > -vmulxs_f32 (float32_t a, float32_t b) > -{ > - float32_t result; > - __asm__ ("fmulx %s0, %s1, %s2" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) > vmvn_p8 (poly8x8_t a) > { > @@ -19172,6 +18507,78 @@ vmul_n_f64 (float64x1_t __a, float64_t __b) > return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; > } > +__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > +vmul_n_f32 (float32x2_t __a, float32_t __b) > +{ > + return __builtin_aarch64_mul_nv2sf (__a, __b); > +} > + > +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) > +vmul_n_s16 (int16x4_t __a, int16_t __b) > +{ > + return __builtin_aarch64_mul_nv4hi (__a, __b); > +} > + > +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) > +vmul_n_s32 (int32x2_t __a, int32_t __b) > +{ > + return __builtin_aarch64_mul_nv2si (__a, __b); > +} > + > +__extension__ static __inline uint16x4_t __attribute__ > ((__always_inline__)) > +vmul_n_u16 (uint16x4_t __a, uint16_t __b) > +{ > + return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a, > + (int16_t)__b); > +} > + > +__extension__ static __inline uint32x2_t __attribute__ > ((__always_inline__)) > +vmul_n_u32 (uint32x2_t __a, uint32_t __b) > +{ > + return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a, > + (int32_t)__b); > +} > + > +/* vmulq_n */ > + > +__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > +vmulq_n_f32 (float32x4_t __a, float32_t __b) > +{ > + return __builtin_aarch64_mul_nv4sf (__a, __b); > +} > + > +__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > +vmulq_n_f64 (float64x2_t __a, float64_t __b) > +{ > + return __builtin_aarch64_mul_nv2df (__a, __b); > +} > + > +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > +vmulq_n_s16 (int16x8_t __a, int16_t __b) > +{ > + return __builtin_aarch64_mul_nv8hi (__a, __b); > +} > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmulq_n_s32 (int32x4_t __a, int32_t __b) > +{ > + return __builtin_aarch64_mul_nv4si (__a, __b); > +} > + > +__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) > +{ > + return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a, > + (int16_t)__b); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmulq_n_u32 (uint32x4_t __a, uint32_t __b) > +{ > + return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a, > + (int32_t)__b); > +} > + > /* vmulq_lane */ > __extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > @@ -19249,6 +18656,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c > return __a * __aarch64_vgetq_lane_u32 (__b, __lane); > } > +/* vmull_high_lane */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_lanev4si (__a, __b, __c); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c) > +{ > + return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a, > + (int16x4_t) __b, > + __c); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c) > +{ > + return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a, > + (int32x2_t) __b, > + __c); > +} > + > +/* vmull_high_laneq */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) > +{ > + return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a, > + (int16x8_t)__b, > + __c); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) > +{ > + return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a, > + (int32x4_t) __b, > + __c); > +} > + > +/* vmull_high_n */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_n_s16 (int16x8_t __a, int16_t __b) > +{ > + return __builtin_aarch64_smull2_nv8hi (__a, __b); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_n_s32 (int32x4_t __a, int32_t __b) > +{ > + return __builtin_aarch64_smull2_nv4si (__a, __b); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) > +{ > + return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) > +{ > + return __builtin_aarch64_umull2_nv4si_uuu (__a, __b); > +} > + > +/* vmull_high */ > + > +__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) > +{ > + return __builtin_aarch64_pmull2v16qi_ppp (__a, __b); > +} > + > +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > +vmull_high_s8 (int8x16_t __a, int8x16_t __b) > +{ > + return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b); > +} > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_s16 (int16x8_t __a, int16x8_t __b) > +{ > + return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
Index: gcc/ChangeLog =================================================================== --- gcc/ChangeLog (revision 218464) +++ gcc/ChangeLog (working copy) @@ -1,3 +1,38 @@ +2014-12-09 Felix Yang <felix.yang@huawei.com> + Jiji Jiang <jiangjiji@huawei.com> + + * config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>, + aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>, + aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>, + aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal, + aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal, + aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>, + aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>, + aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>, + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n, + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, + smull2_lane): New builtins. + * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite + using builtin functions. + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, + VDQF_Q): New unspec and int iterator. + 2014-12-07 Felix Yang <felix.yang@huawei.com> Shanyao Chen <chenshanyao@huawei.com> Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c (revision 0) +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c (revision 0) @@ -0,0 +1,111 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + + +/* Expected results. */ +VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad, + 0xfe24, 0xfe9b, 0xff12, 0xff89 }; +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff9a0, 0xfffffa28, + 0xfffffab0, 0xfffffb38 }; +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff7a2, + 0xfffffffffffff83b }; +VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae, + 0xa758, 0xa802, 0xa8ac, 0xa956 }; +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7, + 0xbaf8b2, 0xbaf96d }; +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbfffff4d8, + 0xcbfffff5a4}; +VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce, + 0x6798, 0x6732, 0x66cc, 0x6666 }; + +#ifndef INSN_NAME +#define INSN_NAME vmull_high +#define TEST_MSG "VMUL_HIGH" +#endif + +#define FNNAME1(NAME) exec_ ## NAME +#define FNNAME(NAME) FNNAME1(NAME) + +void FNNAME (INSN_NAME) (void) +{ +#define DECL_VMUL(T, W, N) \ + DECL_VARIABLE(vector1, T, W, N); \ + DECL_VARIABLE(vector2, T, W, N); + + /* vector_res = OP(vector1, vector2), then store the result. */ +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) \ + VECT_VAR(vector_res, T1, W1, N1) = \ + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ + VECT_VAR(vector2, T1, W, N)); \ + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ + VECT_VAR(vector_res, T1, W1, N1)) + +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1) \ + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) + +#define CHECK_VMULL_HIGH_RESULTS(test_name,comment) \ + { \ + CHECK(test_name, int, 16, 8, PRIx16, expected, comment); \ + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ + CHECK(test_name, uint, 16, 8, PRIx16, expected, comment); \ + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ + CHECK(test_name, poly, 16, 8, PRIx16, expected, comment); \ + } + + DECL_VMUL(int, 8, 16); + DECL_VMUL(int, 16, 8); + DECL_VMUL(int, 32, 4); + DECL_VMUL(uint, 8, 16); + DECL_VMUL(uint, 16, 8); + DECL_VMUL(uint, 32, 4); + DECL_VMUL(poly, 8, 16); + + DECL_VARIABLE(vector_res, int, 16, 8); + DECL_VARIABLE(vector_res, int, 32, 4); + DECL_VARIABLE(vector_res, int, 64, 2); + DECL_VARIABLE(vector_res, uint, 16, 8); + DECL_VARIABLE(vector_res, uint, 32, 4); + DECL_VARIABLE(vector_res, uint, 64, 2); + DECL_VARIABLE(vector_res, poly, 16, 8); + + clean_results (); + + /* Initialize input "vector1" from "buffer". */ + VLOAD(vector1, buffer, q, int, s, 8, 16); + VLOAD(vector1, buffer, q, int, s, 16, 8); + VLOAD(vector1, buffer, q, int, s, 32, 4); + VLOAD(vector1, buffer, q, uint, u, 8, 16); + VLOAD(vector1, buffer, q, uint, u, 16, 8); + VLOAD(vector1, buffer, q, uint, u, 32, 4); + VLOAD(vector1, buffer, q, poly, p, 8, 16); + + /* Choose init value arbitrarily. */ + VDUP(vector2, q, int, s, 8, 16, 0x77); + VDUP(vector2, q, int, s, 16, 8, 0x88); + VDUP(vector2, q, int, s, 32, 4, 0x99); + VDUP(vector2, q, uint, u, 8, 16, 0xAA); + VDUP(vector2, q, uint, u, 16, 8, 0xBB); + VDUP(vector2, q, uint, u, 32, 4, 0xCC); + VDUP(vector2, q, poly, p, 8, 16, 0xAA); + + /* Execute the tests. */ + TEST_VMULL_HIGH(INSN_NAME, , int, s, 8, 16, 16, 8); + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4); + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2); + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 8, 16, 16, 8); + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4); + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2); + TEST_VMULL_HIGH(INSN_NAME, , poly, p, 8, 16, 16, 8); + + CHECK_VMULL_HIGH_RESULTS (TEST_MSG, ""); +} + +int main (void) +{ + FNNAME (INSN_NAME) (); + + return 0; +} Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c ___________________________________________________________________ Added: svn:executable + * Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c (revision 0) +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c (revision 0) @@ -0,0 +1,135 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 }; +VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000}; +VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 }; +VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 }; + +#define TEST_MSG "VMULL_HIGH_LANE/VMULL_HIGH_LANEQ" +void exec_vmull_high_lane (void) +{ + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. */ +#define TEST_VMULL_HIGH_LANE(T1, T2, W, W2, N1, N2, L) \ + VECT_VAR(vector_res, T1, W2, N2) = \ + vmull##_high_lane_##T2##W(VECT_VAR(vector, T1, W, N1 ), \ + VECT_VAR(vector2, T1, W, N2), \ + L); \ + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, N2)) + +#define CHECK_VMULL_HIGH_LANE_RESULTS(test_name,comment) \ + { \ + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ + } + + + /* With ARM RVCT, we need to declare variables before any executable + statement */ + DECL_VARIABLE(vector, int, 16, 8); + DECL_VARIABLE(vector, int, 32, 4); + DECL_VARIABLE(vector, uint, 16, 8); + DECL_VARIABLE(vector, uint, 32, 4); + DECL_VARIABLE(vector2, int, 16, 4); + DECL_VARIABLE(vector2, int, 32, 2); + DECL_VARIABLE(vector2, uint, 16, 4); + DECL_VARIABLE(vector2, uint, 32, 2); + + DECL_VARIABLE(vector_res, int, 32, 4); + DECL_VARIABLE(vector_res, int, 64, 2); + DECL_VARIABLE(vector_res, uint, 32, 4); + DECL_VARIABLE(vector_res, uint, 64, 2); + + clean_results (); + + /* Initialize vector */ + VDUP(vector2, , int, s, 16, 4, 0x1000); + VDUP(vector2, , int, s, 32, 2, 0x1000); + VDUP(vector2, , uint, u, 16, 4, 0x1000); + VDUP(vector2, , uint, u, 32, 2, 0x1000); + + /* Initialize vector2 */ + VDUP(vector, q, int, s, 16, 8, 0x4); + VDUP(vector, q, int, s, 32, 4, 0x2); + VDUP(vector, q, uint, u, 16, 8, 0x4); + VDUP(vector, q, uint, u, 32, 4, 0x2); + + /* Choose lane arbitrarily */ + TEST_VMULL_HIGH_LANE(int, s, 16, 32, 8, 4, 2); + TEST_VMULL_HIGH_LANE(int, s, 32, 64, 4, 2, 1); + TEST_VMULL_HIGH_LANE(uint, u, 16, 32, 8, 4, 2); + TEST_VMULL_HIGH_LANE(uint, u, 32, 64, 4, 2, 1); + + CHECK_VMULL_HIGH_LANE_RESULTS (TEST_MSG, ""); +} + + +void exec_vmull_high_laneq (void) +{ + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. */ +#define TEST_VMULL_HIGH_LANEQ(T1, T2, W, W2, N2, N1, L) \ + VECT_VAR(vector_res, T1, W2, N1) = \ + vmull##_high_laneq_##T2##W(VECT_VAR(vector, T1, W, N2 ), \ + VECT_VAR(vector2, T1, W, N2), \ + L); \ + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N1), VECT_VAR(vector_res, T1, W2, N1)) + +#define CHECK_VMULL_HIGH_LANEQ_RESULTS(test_name,comment) \ + { \ + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ + } + + + /* With ARM RVCT, we need to declare variables before any executable + statement */ + DECL_VARIABLE(vector, int, 16, 8); + DECL_VARIABLE(vector, int, 32, 4); + DECL_VARIABLE(vector, uint, 16, 8); + DECL_VARIABLE(vector, uint, 32, 4); + DECL_VARIABLE(vector2, int, 16, 8); + DECL_VARIABLE(vector2, int, 32, 4); + DECL_VARIABLE(vector2, uint, 16, 8); + DECL_VARIABLE(vector2, uint, 32, 4); + + DECL_VARIABLE(vector_res, int, 32, 4); + DECL_VARIABLE(vector_res, int, 64, 2); + DECL_VARIABLE(vector_res, uint, 32, 4); + DECL_VARIABLE(vector_res, uint, 64, 2); + + clean_results (); + + /* Initialize vector */ + VDUP(vector2, q, int, s, 16, 8, 0x1000); + VDUP(vector2, q, int, s, 32, 4, 0x1000); + VDUP(vector2, q, uint, u, 16, 8, 0x1000); + VDUP(vector2, q, uint, u, 32, 4, 0x1000); + + /* Initialize vector2 */ + VDUP(vector, q, int, s, 16, 8, 0x4); + VDUP(vector, q, int, s, 32, 4, 0x2); + VDUP(vector, q, uint, u, 16, 8, 0x4); + VDUP(vector, q, uint, u, 32, 4, 0x2); + + /* Choose lane arbitrarily */ + TEST_VMULL_HIGH_LANEQ(int, s, 16, 32, 8, 4, 2); + TEST_VMULL_HIGH_LANEQ(int, s, 32, 64, 4, 2, 1); + TEST_VMULL_HIGH_LANEQ(uint, u, 16, 32, 8, 4, 2); + TEST_VMULL_HIGH_LANEQ(uint, u, 32, 64, 4, 2, 1); + + CHECK_VMULL_HIGH_LANEQ_RESULTS (TEST_MSG, ""); +} + + + + +int main (void) +{ + exec_vmull_high_lane(); + return 0; +} Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c ___________________________________________________________________ Added: svn:executable + * Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c (revision 0) +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c (revision 0) @@ -0,0 +1,81 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + + +/* Expected results. */ +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff73c, 0xfffff7f7, + 0xfffff8b2, 0xfffff96d }; +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff4d8, + 0xfffffffffffff5a4 }; +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xedf4d8, 0xedf5c6, + 0xedf6b4, 0xedf7a2 }; +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfefffff20e, + 0xfefffff30d}; + +#ifndef INSN_NAME +#define INSN_NAME vmull_high_n +#define TEST_MSG "VMULL_HIGH_N" +#endif + +#define FNNAME1(NAME) exec_ ## NAME +#define FNNAME(NAME) FNNAME1(NAME) + +void FNNAME (INSN_NAME) (void) +{ +#define DECL_VMUL(T, W, N) \ + DECL_VARIABLE(vector1, T, W, N); \ + + /* vector_res = OP(vector1, vector2), then store the result. */ +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) \ + VECT_VAR(vector_res, T1, W1, N1) = \ + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ + C); \ + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ + VECT_VAR(vector_res, T1, W1, N1)) + +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1, C) \ + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) + +#define CHECK_VMULL_HIGH_N_RESULTS(test_name,comment) \ + { \ + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ + } + + DECL_VMUL(int, 16, 8); + DECL_VMUL(int, 32, 4); + DECL_VMUL(uint, 16, 8); + DECL_VMUL(uint, 32, 4); + + DECL_VARIABLE(vector_res, int, 32, 4); + DECL_VARIABLE(vector_res, int, 64, 2); + DECL_VARIABLE(vector_res, uint, 32, 4); + DECL_VARIABLE(vector_res, uint, 64, 2); + + clean_results (); + + /* Initialize input "vector1" from "buffer". */ + VLOAD(vector1, buffer, q, int, s, 16, 8); + VLOAD(vector1, buffer, q, int, s, 32, 4); + VLOAD(vector1, buffer, q, uint, u, 16, 8); + VLOAD(vector1, buffer, q, uint, u, 32, 4); + + + /* Execute the tests. */ + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4, 0xBB); + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2, 0xCC); + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4, 0xEE); + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2, 0xFF); + + CHECK_VMULL_HIGH_N_RESULTS (TEST_MSG, ""); +} + +int main (void) +{ + FNNAME (INSN_NAME) (); + + return 0; +} Property changes on: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c ___________________________________________________________________ Added: svn:executable + * Index: gcc/testsuite/ChangeLog =================================================================== --- gcc/testsuite/ChangeLog (revision 218464) +++ gcc/testsuite/ChangeLog (working copy) @@ -1,3 +1,13 @@ +2014-12-09 Felix Yang <felix.yang@huawei.com> + Jiji Jiang <jiangjiji@huawei.com> + + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c: New + test. + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c: + New test. + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c: New + test. + 2014-12-07 Christophe Lyon <christophe.lyon@linaro.org> * gcc.target/aarch64/advsimd-intrinsics/vaddhn.c: Actually execute Index: gcc/config/aarch64/arm_neon.h =================================================================== --- gcc/config/aarch64/arm_neon.h (revision 218464) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -7627,671 +7627,6 @@ vmovn_u64 (uint64x2_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) -{ - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) -{ - int16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) -{ - int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) -{ - uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) -{ - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_high_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_high_n_s16 (int16x8_t a, int16_t b) -{ - int32x4_t result; - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_high_n_s32 (int32x4_t a, int32_t b) -{ - int64x2_t result; - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_high_n_u16 (uint16x8_t a, uint16_t b) -{ - uint32x4_t result; - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_high_n_u32 (uint32x4_t a, uint32_t b) -{ - uint64x2_t result; - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vmull_high_p8 (poly8x16_t a, poly8x16_t b) -{ - poly16x8_t result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmull_high_s8 (int8x16_t a, int8x16_t b) -{ - int16x8_t result; - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_high_s16 (int16x8_t a, int16x8_t b) -{ - int32x4_t result; - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_high_s32 (int32x4_t a, int32x4_t b) -{ - int64x2_t result; - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmull_high_u8 (uint8x16_t a, uint8x16_t b) -{ - uint16x8_t result; - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_high_u16 (uint16x8_t a, uint16x8_t b) -{ - uint32x4_t result; - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_high_u32 (uint32x4_t a, uint32x4_t b) -{ - uint64x2_t result; - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_n_s16 (int16x4_t a, int16_t b) -{ - int32x4_t result; - __asm__ ("smull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_n_s32 (int32x2_t a, int32_t b) -{ - int64x2_t result; - __asm__ ("smull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_n_u16 (uint16x4_t a, uint16_t b) -{ - uint32x4_t result; - __asm__ ("umull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_n_u32 (uint32x2_t a, uint32_t b) -{ - uint64x2_t result; - __asm__ ("umull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vmull_p8 (poly8x8_t a, poly8x8_t b) -{ - poly16x8_t result; - __asm__ ("pmull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmull_s8 (int8x8_t a, int8x8_t b) -{ - int16x8_t result; - __asm__ ("smull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_s16 (int16x4_t a, int16x4_t b) -{ - int32x4_t result; - __asm__ ("smull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_s32 (int32x2_t a, int32x2_t b) -{ - int64x2_t result; - __asm__ ("smull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmull_u8 (uint8x8_t a, uint8x8_t b) -{ - uint16x8_t result; - __asm__ ("umull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_u16 (uint16x4_t a, uint16x4_t b) -{ - uint32x4_t result; - __asm__ ("umull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_u32 (uint32x2_t a, uint32x2_t b) -{ - uint64x2_t result; - __asm__ ("umull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmulq_n_f32 (float32x4_t a, float32_t b) -{ - float32x4_t result; - __asm__ ("fmul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmulq_n_f64 (float64x2_t a, float64_t b) -{ - float64x2_t result; - __asm__ ("fmul %0.2d,%1.2d,%2.d[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmulq_n_s16 (int16x8_t a, int16_t b) -{ - int16x8_t result; - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmulq_n_s32 (int32x4_t a, int32_t b) -{ - int32x4_t result; - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmulq_n_u16 (uint16x8_t a, uint16_t b) -{ - uint16x8_t result; - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmulq_n_u32 (uint32x4_t a, uint32_t b) -{ - uint32x4_t result; - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmulx_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("fmulx %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmulx_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vmulxd_f64 (float64_t a, float64_t b) -{ - float64_t result; - __asm__ ("fmulx %d0, %d1, %d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmulxq_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("fmulx %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmulxq_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("fmulx %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmulxq_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulxq_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vmulxs_f32 (float32_t a, float32_t b) -{ - float32_t result; - __asm__ ("fmulx %s0, %s1, %s2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) vmvn_p8 (poly8x8_t a) { @@ -19172,6 +18507,78 @@ vmul_n_f64 (float64x1_t __a, float64_t __b) return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; } +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmul_n_f32 (float32x2_t __a, float32_t __b) +{ + return __builtin_aarch64_mul_nv2sf (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmul_n_s16 (int16x4_t __a, int16_t __b) +{ + return __builtin_aarch64_mul_nv4hi (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmul_n_s32 (int32x2_t __a, int32_t __b) +{ + return __builtin_aarch64_mul_nv2si (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmul_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a, + (int16_t)__b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmul_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a, + (int32_t)__b); +} + +/* vmulq_n */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulq_n_f32 (float32x4_t __a, float32_t __b) +{ + return __builtin_aarch64_mul_nv4sf (__a, __b); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulq_n_f64 (float64x2_t __a, float64_t __b) +{ + return __builtin_aarch64_mul_nv2df (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmulq_n_s16 (int16x8_t __a, int16_t __b) +{ + return __builtin_aarch64_mul_nv8hi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmulq_n_s32 (int32x4_t __a, int32_t __b) +{ + return __builtin_aarch64_mul_nv4si (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) +{ + return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a, + (int16_t)__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmulq_n_u32 (uint32x4_t __a, uint32_t __b) +{ + return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a, + (int32_t)__b); +} + /* vmulq_lane */ __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) @@ -19249,6 +18656,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c return __a * __aarch64_vgetq_lane_u32 (__b, __lane); } +/* vmull_high_lane */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_smull2_lanev4si (__a, __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c) +{ + return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a, + (int16x4_t) __b, + __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c) +{ + return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a, + (int32x2_t) __b, + __c); +} + +/* vmull_high_laneq */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a, + (int16x8_t)__b, + __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a, + (int32x4_t) __b, + __c); +} + +/* vmull_high_n */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_high_n_s16 (int16x8_t __a, int16_t __b) +{ + return __builtin_aarch64_smull2_nv8hi (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_high_n_s32 (int32x4_t __a, int32_t __b) +{ + return __builtin_aarch64_smull2_nv4si (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) +{ + return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) +{ + return __builtin_aarch64_umull2_nv4si_uuu (__a, __b); +} + +/* vmull_high */ + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return __builtin_aarch64_pmull2v16qi_ppp (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmull_high_s8 (int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_high_s16 (int16x8_t __a, int16x8_t __b) +{ + return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_high_s32 (int32x4_t __a, int32x4_t __b) +{ + return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmull_high_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_high_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_high_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b); +} + +/* vmull_lane */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_smull_lanev4hi (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_smull_lanev2si (__a, __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c); +} + +/* vmull_laneq */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_smull_laneqv2si (__a, __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __c) +{ + return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __c) +{ + return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c); +} + +/* vmull_n */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_n_s16 (int16x4_t __a, int16_t __b) +{ + return __builtin_aarch64_smull_nv4hi (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_n_s32 (int32x2_t __a, int32_t __b) +{ + return __builtin_aarch64_smull_nv2si (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return __builtin_aarch64_umull_nv4hi_uuu (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return __builtin_aarch64_umull_nv2si_uuu (__a, __b); +} + +/* vmull */ +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vmull_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return __builtin_aarch64_pmullv8qi_ppp (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmull_s8 (int8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_smullv8qi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_smullv4hi (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_smullv2si (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmull_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return __builtin_aarch64_umullv8qi_uuu (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return __builtin_aarch64_umullv4hi_uuu (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return __builtin_aarch64_umullv2si_uuu (__a, __b); +} + +/* vmulx */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmulx_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fmulxv2sf (__a, __b); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c) +{ + return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c); +} + + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vmulxd_f64 (float64_t __a, float64_t __b) +{ + return __builtin_aarch64_fmulxdf (__a, __b); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulxq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fmulxv4sf (__a, __b); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulxq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fmulxv2df (__a, __b); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c) +{ + return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c) +{ + return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vmulxs_f32 (float32_t __a, float32_t __b) +{ + return __builtin_aarch64_fmulxsf (__a, __b); +} + /* vneg */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) Index: gcc/config/aarch64/iterators.md =================================================================== --- gcc/config/aarch64/iterators.md (revision 218464) +++ gcc/config/aarch64/iterators.md (working copy) @@ -273,6 +273,8 @@ UNSPEC_SHA256SU1 ; Used in aarch64-simd.md. UNSPEC_PMULL ; Used in aarch64-simd.md. UNSPEC_PMULL2 ; Used in aarch64-simd.md. + UNSPEC_FMULX ; Used in aarch64-simd.md. + UNSPEC_FMULX_LANE ; Used in aarch64-simd.md. ]) ;; ------------------------------------------------------------------- @@ -462,6 +464,9 @@ ) +(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF") + (V2DF "V2DF")]) + ;; Widened mode register suffixes for VD_BHSI/VQW. (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s") (V2SI "2d") (V16QI "8h") Index: gcc/config/aarch64/aarch64-simd.md =================================================================== --- gcc/config/aarch64/aarch64-simd.md (revision 218464) +++ gcc/config/aarch64/aarch64-simd.md (working copy) @@ -1394,6 +1394,253 @@ } ) +(define_insn "aarch64_mul_n<mode>" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (mult:VMUL + (match_operand:VMUL 1 "register_operand" "w") + (vec_duplicate:VMUL + (match_operand:<VEL> 2 "register_operand" "<h_con>"))))] + "TARGET_SIMD" + "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]" + [(set_attr "type" "neon_mul_<Vetype>_long")] +) + +(define_insn "aarch64_<su>mull_n<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (match_operand:VD_HSI 1 "register_operand" "w")) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:VD_HSI + (match_operand:<VEL> 2 "register_operand" "<vwx>")))))] + "TARGET_SIMD" + "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]" + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + + +(define_insn "aarch64_<su>mull<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (match_operand:VD_BHSI 1 "register_operand" "w")) + (ANY_EXTEND:<VWIDE> + (match_operand:VD_BHSI 2 "register_operand" "w"))))] + "TARGET_SIMD" + "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>" + [(set_attr "type" "neon_mul_<Vetype>_long")] +) + +(define_insn "aarch64_simd_<su>mull2_n<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF> + (match_operand:<VEL> 2 "register_operand" "<vw>")))))] + "TARGET_SIMD" + "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]" + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_expand "aarch64_<su>mull2_n<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "") + (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" "")) + (match_operand:<VEL> 2 "register_operand" "")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0], + operands[1], + operands[2], p)); + DONE; + + } +) + +(define_insn "aarch64_<su>mull_lane<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (match_operand:VD_HSI 1 "register_operand" "w")) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:VD_HSI + (vec_select:<VEL> + (match_operand:<VCOND> 2 "register_operand" "<vwx>") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_insn "aarch64_<su>mull_laneq<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (match_operand:VD_HSI 1 "register_operand" "w")) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:VD_HSI + (vec_select:<VEL> + (match_operand:<VCONQ> 2 "register_operand" "<vwx>") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_insn "aarch64_<su>mull2_lane<mode>_internal" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:<VHALF> + (vec_select:<VEL> + (match_operand:<VCOND> 2 "register_operand" "<vwx>") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_insn "aarch64_<su>mull2_laneq<mode>_internal" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:<VHALF> + (vec_select:<VEL> + (match_operand:<VCONQ> 2 "register_operand" "<vwx>") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_expand "aarch64_smull2_lane<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "=w") + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:<VCOND> 2 "register_operand" "<vwx>") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1], + operands[2], operands[3], + p)); + DONE; +}) + +(define_expand "aarch64_umull2_lane<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "=w") + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:<VCOND> 2 "register_operand" "<vwx>") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1], + operands[2], operands[3], + p)); + DONE; +}) + +(define_expand "aarch64_smull2_laneq<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "=w") + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:<VCONQ> 2 "register_operand" "<vwx>") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1], + operands[2], operands[3], + p)); + DONE; +}) + +(define_expand "aarch64_umull2_laneq<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "=w") + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:<VCONQ> 2 "register_operand" "<vwx>") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1], + operands[2], operands[3], + p)); + DONE; +}) + +(define_insn "aarch64_fmulx<mode>" + [(set (match_operand:VDQF 0 "register_operand" "=w") + (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w") + (match_operand:VDQF 2 "register_operand" "w")] + UNSPEC_FMULX))] + "TARGET_SIMD" + "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>" + [(set_attr "type" "neon_mul_s")] +) + +(define_insn "aarch64_fmulx<mode>" + [(set (match_operand:GPF 0 "register_operand" "=w") + (unspec:GPF [(match_operand:GPF 1 "register_operand" "w") + (match_operand:GPF 2 "register_operand" "w")] + UNSPEC_FMULX))] + "TARGET_SIMD" + "fmulx\\t%<s>0, %<s>1, %<s>2" + [(set_attr "type" "neon_mul_s")] +) + +(define_insn "aarch64_fmulx_lane<mode>" + [(set (match_operand:VDQF 0 "register_operand" "=w") + (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w") + (match_operand:<VDQF_Q> 2 "register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_FMULX_LANE))] + "TARGET_SIMD" + "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>" + [(set_attr "type" "neon_mul_s")] +) + +(define_insn "aarch64_pmull2v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w") + (match_operand:V16QI 2 "register_operand" "w")] + UNSPEC_PMULL2))] + "TARGET_SIMD" + "pmull2\\t%0.8h, %1.16b, %2.16b" + [(set_attr "type" "neon_mul_b_long")] +) + +(define_insn "aarch64_pmullv8qi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (unspec:V8HI [(match_operand:V8QI 1 "register_operand" "w") + (match_operand:V8QI 2 "register_operand" "w")] + UNSPEC_PMULL))] + "TARGET_SIMD" + "pmull\\t%0.8h, %1.8b, %2.8b" + [(set_attr "type" "neon_mul_b_long")] +) + ;; FP vector operations. ;; AArch64 AdvSIMD supports single-precision (32-bit) and ;; double-precision (64-bit) floating-point data types and arithmetic as Index: gcc/config/aarch64/aarch64-simd-builtins.def =================================================================== --- gcc/config/aarch64/aarch64-simd-builtins.def (revision 218464) +++ gcc/config/aarch64/aarch64-simd-builtins.def (working copy) @@ -184,6 +184,39 @@ BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0) BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0) + /* Implemented by vec_widen_<su>mult_hi_<mode>. */ + BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10) + BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10) + /* Implemented by aarch64_<su>mull<mode>. */ + BUILTIN_VD_BHSI (BINOPU, umull, 0) + BUILTIN_VD_BHSI (BINOP, smull, 0) + /* Implemented by aarch64_<su>mull_n<mode>. */ + BUILTIN_VD_HSI (BINOP, smull_n, 0) + BUILTIN_VD_HSI (BINOPU, umull_n, 0) + /* Implemented by aarch64_mul_n<mode>. */ + BUILTIN_VMUL (BINOP, mul_n, 0) + /* Implemented by aarch64_<su>mull2_n<mode>. */ + BUILTIN_VQ_HSI (BINOP, smull2_n, 0) + BUILTIN_VQ_HSI (BINOPU, umull2_n, 0) + /* Implemented by aarch64_<su>mull_lane<q><mode>. */ + BUILTIN_VD_HSI (TERNOP, smull_lane, 0) + BUILTIN_VD_HSI (TERNOPU, umull_lane, 0) + BUILTIN_VD_HSI (TERNOP, smull_laneq, 0) + BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0) + /* Implemented by aarch64_<su>mull2_lane<q><mode>. */ + BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0) + BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0) + BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0) + BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0) + /* Implemented by aarch64_fmulx<mode>. */ + BUILTIN_VDQF (BINOP, fmulx, 0) + BUILTIN_GPF (BINOP, fmulx, 0) + BUILTIN_VDQF (BINOP, fmulx_lane, 0) + + /* Implemented by aarch64_pmull<2><mode>.*/ + VAR1 (BINOPP, pmull, 0, v8qi) + VAR1 (BINOPP, pmull2, 0, v16qi) + BUILTIN_VSDQ_I_DI (BINOP, ashl, 3) /* Implemented by aarch64_<sur>shl<mode>. */ BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)