From patchwork Mon May 16 09:09:37 2016
X-Patchwork-Submitter: Jiong Wang
X-Patchwork-Id: 622515
From: Jiong Wang
Subject: [AArch64, 3/4] Reimplement multiply by element to get rid of inline assembly
References: <57398D3D.1040806@foss.arm.com> <57398D5E.6070503@foss.arm.com>
To: GCC Patches
Message-ID: <57398E51.3090903@foss.arm.com>
Date: Mon, 16 May 2016 10:09:37 +0100
In-Reply-To: <57398D5E.6070503@foss.arm.com>

This patch reimplements vector multiply by element on top of the existing
vmul_lane* intrinsics instead of inline assembly.

There is no code generation change from this patch.

OK for trunk?

2016-05-16  Jiong Wang

gcc/
	* config/aarch64/arm_neon.h (vmul_n_f32): Remove inline assembly.
	Use builtin.
	(vmul_n_s16): Likewise.
	(vmul_n_s32): Likewise.
	(vmul_n_u16): Likewise.
	(vmul_n_u32): Likewise.
	(vmulq_n_f32): Likewise.
	(vmulq_n_f64): Likewise.
	(vmulq_n_s16): Likewise.
	(vmulq_n_s32): Likewise.
	(vmulq_n_u16): Likewise.
	(vmulq_n_u32): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/simd/vmul_elem_1.c: Use intrinsics.

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ca7ace5aa656163826569d046fcbf02f9f7d4d6c..84931aeec2d885f8552197fe8a72500f127e2bbb 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7938,61 +7938,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vmull_high_lane_s16(a, b, c) \
 __extension__ \
   ({ \
@@ -8443,72 +8388,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-           : "=w"(result)
-           : "w"(a), "x"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -18924,6 +18803,74 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
   return __a * __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vmul_n.  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_n_f64 (float64x2_t __a, float64_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+  return __a * __b;
+}
+
 /* vneg  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
index 290a4e9adbc5d9ce1335ca28120e437293776f30..155cac3b4a5579318244533c3ab590250c150dd6 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
@@ -142,13 +142,13 @@ check_v2sf (float32_t elemA, float32_t elemB)
   int32_t indx;
   const float32_t vec32x2_buf[2] = {A, B};
   float32x2_t vec32x2_src = vld1_f32 (vec32x2_buf);
-  float32x2_t vec32x2_res = vec32x2_src * elemA;
+  float32x2_t vec32x2_res = vmul_n_f32 (vec32x2_src, elemA);
 
   for (indx = 0; indx < 2; indx++)
     if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_1[indx])
       abort ();
 
-  vec32x2_res = vec32x2_src * elemB;
+  vec32x2_res = vmul_n_f32 (vec32x2_src, elemB);
 
   for (indx = 0; indx < 2; indx++)
     if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_2[indx])
@@ -163,25 +163,25 @@ check_v4sf (float32_t elemA, float32_t elemB, float32_t elemC, float32_t elemD)
   int32_t indx;
   const float32_t vec32x4_buf[4] = {A, B, C, D};
   float32x4_t vec32x4_src = vld1q_f32 (vec32x4_buf);
-  float32x4_t vec32x4_res = vec32x4_src * elemA;
+  float32x4_t vec32x4_res = vmulq_n_f32 (vec32x4_src, elemA);
 
   for (indx = 0; indx < 4; indx++)
     if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_1[indx])
       abort ();
 
-  vec32x4_res = vec32x4_src * elemB;
+  vec32x4_res = vmulq_n_f32 (vec32x4_src, elemB);
 
   for (indx = 0; indx < 4; indx++)
     if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_2[indx])
       abort ();
 
-  vec32x4_res = vec32x4_src * elemC;
+  vec32x4_res = vmulq_n_f32 (vec32x4_src, elemC);
 
   for (indx = 0; indx < 4; indx++)
     if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_3[indx])
       abort ();
 
-  vec32x4_res = vec32x4_src * elemD;
+  vec32x4_res = vmulq_n_f32 (vec32x4_src, elemD);
 
   for (indx = 0; indx < 4; indx++)
     if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_4[indx])
@@ -196,13 +196,13 @@ check_v2df (float64_t elemdC, float64_t elemdD)
   int32_t indx;
   const float64_t vec64x2_buf[2] = {AD, BD};
   float64x2_t vec64x2_src = vld1q_f64 (vec64x2_buf);
-  float64x2_t vec64x2_res = vec64x2_src * elemdC;
+  float64x2_t vec64x2_res = vmulq_n_f64 (vec64x2_src, elemdC);
 
   for (indx = 0; indx < 2; indx++)
     if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_1[indx])
       abort ();
 
-  vec64x2_res = vec64x2_src * elemdD;
+  vec64x2_res = vmulq_n_f64 (vec64x2_src, elemdD);
 
   for (indx = 0; indx < 2; indx++)
     if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_2[indx])
@@ -217,13 +217,13 @@ check_v2si (int32_t elemsA, int32_t elemsB)
   int32_t indx;
   const int32_t vecs32x2_buf[2] = {AS, BS};
   int32x2_t vecs32x2_src = vld1_s32 (vecs32x2_buf);
-  int32x2_t vecs32x2_res = vecs32x2_src * elemsA;
+  int32x2_t vecs32x2_res = vmul_n_s32 (vecs32x2_src, elemsA);
 
   for (indx = 0; indx < 2; indx++)
     if (vecs32x2_res[indx] != expecteds2_1[indx])
       abort ();
 
-  vecs32x2_res = vecs32x2_src * elemsB;
+  vecs32x2_res = vmul_n_s32 (vecs32x2_src, elemsB);
 
   for (indx = 0; indx < 2; indx++)
     if (vecs32x2_res[indx] != expecteds2_2[indx])
@@ -236,13 +236,13 @@ check_v2si_unsigned (uint32_t elemusA, uint32_t elemusB)
   int indx;
   const uint32_t vecus32x2_buf[2] = {AUS, BUS};
   uint32x2_t vecus32x2_src = vld1_u32 (vecus32x2_buf);
-  uint32x2_t vecus32x2_res = vecus32x2_src * elemusA;
+  uint32x2_t vecus32x2_res = vmul_n_u32 (vecus32x2_src, elemusA);
 
   for (indx = 0; indx < 2; indx++)
     if (vecus32x2_res[indx] != expectedus2_1[indx])
       abort ();
 
-  vecus32x2_res = vecus32x2_src * elemusB;
+  vecus32x2_res = vmul_n_u32 (vecus32x2_src, elemusB);
 
   for (indx = 0; indx < 2; indx++)
     if (vecus32x2_res[indx] != expectedus2_2[indx])
@@ -257,25 +257,25 @@ check_v4si (int32_t elemsA, int32_t elemsB, int32_t elemsC, int32_t elemsD)
   int32_t indx;
   const int32_t vecs32x4_buf[4] = {AS, BS, CS, DS};
   int32x4_t vecs32x4_src = vld1q_s32 (vecs32x4_buf);
-  int32x4_t vecs32x4_res = vecs32x4_src * elemsA;
+  int32x4_t vecs32x4_res = vmulq_n_s32 (vecs32x4_src, elemsA);
 
   for (indx = 0; indx < 4; indx++)
     if (vecs32x4_res[indx] != expecteds4_1[indx])
       abort ();
 
-  vecs32x4_res = vecs32x4_src * elemsB;
+  vecs32x4_res = vmulq_n_s32 (vecs32x4_src, elemsB);
 
   for (indx = 0; indx < 4; indx++)
     if (vecs32x4_res[indx] != expecteds4_2[indx])
       abort ();
 
-  vecs32x4_res = vecs32x4_src * elemsC;
+  vecs32x4_res = vmulq_n_s32 (vecs32x4_src, elemsC);
 
   for (indx = 0; indx < 4; indx++)
     if (vecs32x4_res[indx] != expecteds4_3[indx])
       abort ();
 
-  vecs32x4_res = vecs32x4_src * elemsD;
+  vecs32x4_res = vmulq_n_s32 (vecs32x4_src, elemsD);
 
   for (indx = 0; indx < 4; indx++)
     if (vecs32x4_res[indx] != expecteds4_4[indx])
@@ -289,25 +289,25 @@ check_v4si_unsigned (uint32_t elemusA, uint32_t elemusB, uint32_t elemusC,
   int indx;
   const uint32_t vecus32x4_buf[4] = {AUS, BUS, CUS, DUS};
   uint32x4_t vecus32x4_src = vld1q_u32 (vecus32x4_buf);
-  uint32x4_t vecus32x4_res = vecus32x4_src * elemusA;
+  uint32x4_t vecus32x4_res = vmulq_n_u32 (vecus32x4_src, elemusA);
 
   for (indx = 0; indx < 4; indx++)
     if (vecus32x4_res[indx] != expectedus4_1[indx])
       abort ();
 
-  vecus32x4_res = vecus32x4_src * elemusB;
+  vecus32x4_res = vmulq_n_u32 (vecus32x4_src, elemusB);
 
   for (indx = 0; indx < 4; indx++)
     if (vecus32x4_res[indx] != expectedus4_2[indx])
       abort ();
 
-  vecus32x4_res = vecus32x4_src * elemusC;
+  vecus32x4_res = vmulq_n_u32 (vecus32x4_src, elemusC);
 
   for (indx = 0; indx < 4; indx++)
     if (vecus32x4_res[indx] != expectedus4_3[indx])
       abort ();
 
-  vecus32x4_res = vecus32x4_src * elemusD;
+  vecus32x4_res = vmulq_n_u32 (vecus32x4_src, elemusD);
 
   for (indx = 0; indx < 4; indx++)
     if (vecus32x4_res[indx] != expectedus4_4[indx])
@@ -323,25 +323,25 @@ check_v4hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD)
   int32_t indx;
   const int16_t vech16x4_buf[4] = {AH, BH, CH, DH};
   int16x4_t vech16x4_src = vld1_s16 (vech16x4_buf);
-  int16x4_t vech16x4_res = vech16x4_src * elemhA;
+  int16x4_t vech16x4_res = vmul_n_s16 (vech16x4_src, elemhA);
 
   for (indx = 0; indx < 4; indx++)
     if (vech16x4_res[indx] != expectedh4_1[indx])
      abort ();
 
-  vech16x4_res = vech16x4_src * elemhB;
+  vech16x4_res = vmul_n_s16 (vech16x4_src, elemhB);
 
   for (indx = 0; indx < 4; indx++)
     if (vech16x4_res[indx] != expectedh4_2[indx])
      abort ();
 
-  vech16x4_res = vech16x4_src * elemhC;
+  vech16x4_res = vmul_n_s16 (vech16x4_src, elemhC);
 
   for (indx = 0; indx < 4; indx++)
     if (vech16x4_res[indx] != expectedh4_3[indx])
      abort ();
 
-  vech16x4_res = vech16x4_src * elemhD;
+  vech16x4_res = vmul_n_s16 (vech16x4_src, elemhD);
 
   for (indx = 0; indx < 4; indx++)
     if (vech16x4_res[indx] != expectedh4_4[indx])
@@ -355,25 +355,25 @@ check_v4hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
   int indx;
   const uint16_t vecuh16x4_buf[4] = {AUH, BUH, CUH, DUH};
   uint16x4_t vecuh16x4_src = vld1_u16 (vecuh16x4_buf);
-  uint16x4_t vecuh16x4_res = vecuh16x4_src * elemuhA;
+  uint16x4_t vecuh16x4_res = vmul_n_u16 (vecuh16x4_src, elemuhA);
 
   for (indx = 0; indx < 4; indx++)
     if (vecuh16x4_res[indx] != expecteduh4_1[indx])
      abort ();
 
-  vecuh16x4_res = vecuh16x4_src * elemuhB;
+  vecuh16x4_res = vmul_n_u16 (vecuh16x4_src, elemuhB);
 
   for (indx = 0; indx < 4; indx++)
     if (vecuh16x4_res[indx] != expecteduh4_2[indx])
      abort ();
 
-  vecuh16x4_res = vecuh16x4_src * elemuhC;
+  vecuh16x4_res = vmul_n_u16 (vecuh16x4_src, elemuhC);
 
   for (indx = 0; indx < 4; indx++)
     if (vecuh16x4_res[indx] != expecteduh4_3[indx])
      abort ();
 
-  vecuh16x4_res = vecuh16x4_src * elemuhD;
+  vecuh16x4_res = vmul_n_u16 (vecuh16x4_src, elemuhD);
 
   for (indx = 0; indx < 4; indx++)
     if (vecuh16x4_res[indx] != expecteduh4_4[indx])
@@ -389,49 +389,49 @@ check_v8hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD,
   int32_t indx;
   const int16_t vech16x8_buf[8] = {AH, BH, CH, DH, EH, FH, GH, HH};
   int16x8_t vech16x8_src = vld1q_s16 (vech16x8_buf);
-  int16x8_t vech16x8_res = vech16x8_src * elemhA;
+  int16x8_t vech16x8_res = vmulq_n_s16 (vech16x8_src, elemhA);
 
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_1[indx])
      abort ();
 
-  vech16x8_res = vech16x8_src * elemhB;
+  vech16x8_res = vmulq_n_s16 (vech16x8_src, elemhB);
 
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_2[indx])
      abort ();
 
-  vech16x8_res = vech16x8_src * elemhC;
+  vech16x8_res = vmulq_n_s16 (vech16x8_src, elemhC);
 
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_3[indx])
      abort ();
 
-  vech16x8_res = vech16x8_src * elemhD;
+  vech16x8_res = vmulq_n_s16 (vech16x8_src, elemhD);
 
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_4[indx])
      abort ();
 
-  vech16x8_res = vech16x8_src * elemhE;
+  vech16x8_res = vmulq_n_s16 (vech16x8_src, elemhE);
 
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_5[indx])
      abort ();
 
-  vech16x8_res = vech16x8_src * elemhF;
+  vech16x8_res = vmulq_n_s16 (vech16x8_src, elemhF);
 
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_6[indx])
      abort ();
 
-  vech16x8_res = vech16x8_src * elemhG;
+  vech16x8_res = vmulq_n_s16 (vech16x8_src, elemhG);
 
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_7[indx])
      abort ();
 
-  vech16x8_res = vech16x8_src * elemhH;
+  vech16x8_res = vmulq_n_s16 (vech16x8_src, elemhH);
 
   for (indx = 0; indx < 8; indx++)
     if (vech16x8_res[indx] != expectedh8_8[indx])
@@ -446,49 +446,49 @@ check_v8hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
   int indx;
   const uint16_t vecuh16x8_buf[8] = {AUH, BUH, CUH, DUH, EUH, FUH, GUH, HUH};
   uint16x8_t vecuh16x8_src = vld1q_u16 (vecuh16x8_buf);
-  uint16x8_t vecuh16x8_res = vecuh16x8_src * elemuhA;
+  uint16x8_t vecuh16x8_res = vmulq_n_u16 (vecuh16x8_src, elemuhA);
 
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_1[indx])
      abort ();
 
-  vecuh16x8_res = vecuh16x8_src * elemuhB;
+  vecuh16x8_res = vmulq_n_u16 (vecuh16x8_src, elemuhB);
 
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_2[indx])
      abort ();
 
-  vecuh16x8_res = vecuh16x8_src * elemuhC;
+  vecuh16x8_res = vmulq_n_u16 (vecuh16x8_src, elemuhC);
 
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_3[indx])
      abort ();
 
-  vecuh16x8_res = vecuh16x8_src * elemuhD;
+  vecuh16x8_res = vmulq_n_u16 (vecuh16x8_src, elemuhD);
 
   for (indx = 0; indx < 8; indx++)
     if (vecuh16x8_res[indx] != expecteduh8_4[indx])
      abort ();
 
-  vecuh16x8_res = vecuh16x8_src * elemuhE;
+  vecuh16x8_res = vmulq_n_u16 (vecuh16x8_src, elemuhE);
 
   for (indx = 0; indx < 8; indx++)
    if (vecuh16x8_res[indx] != expecteduh8_5[indx])
      abort ();
 
-  vecuh16x8_res = vecuh16x8_src * elemuhF;
+  vecuh16x8_res = vmulq_n_u16 (vecuh16x8_src, elemuhF);
 
   for (indx = 0; indx < 8; indx++)
    if (vecuh16x8_res[indx] != expecteduh8_6[indx])
      abort ();
 
-  vecuh16x8_res = vecuh16x8_src * elemuhG;
+  vecuh16x8_res = vmulq_n_u16 (vecuh16x8_src, elemuhG);
 
   for (indx = 0; indx < 8; indx++)
    if (vecuh16x8_res[indx] != expecteduh8_7[indx])
      abort ();
 
-  vecuh16x8_res = vecuh16x8_src * elemuhH;
+  vecuh16x8_res = vmulq_n_u16 (vecuh16x8_src, elemuhH);
 
   for (indx = 0; indx < 8; indx++)
    if (vecuh16x8_res[indx] != expecteduh8_8[indx])
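
For reference only (not part of the patch): the behaviour the rewritten
intrinsics must preserve can be spot-checked with a small standalone program
along the lines below.  This is a minimal sketch assuming an AArch64 target
and GCC's <arm_neon.h>; the buffer contents and scalar values are arbitrary
and are not taken from the testsuite.

#include <arm_neon.h>
#include <stdlib.h>

int
main (void)
{
  int i;

  /* vmul_n_f32 now expands to a plain vector-by-scalar multiplication
     instead of the old "fmul %0.2s,%1.2s,%2.s[0]" inline asm; every lane
     must still be multiplied by the scalar.  */
  const float32_t fbuf[2] = {2.0f, 3.0f};
  float32x2_t fvec = vld1_f32 (fbuf);
  float32x2_t fres = vmul_n_f32 (fvec, 4.0f);
  for (i = 0; i < 2; i++)
    if (fres[i] != fbuf[i] * 4.0f)
      abort ();

  /* Likewise for the integer variants, e.g. vmulq_n_s16.  */
  const int16_t hbuf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int16x8_t hvec = vld1q_s16 (hbuf);
  int16x8_t hres = vmulq_n_s16 (hvec, 3);
  for (i = 0; i < 8; i++)
    if (hres[i] != (int16_t) (hbuf[i] * 3))
      abort ();

  return 0;
}

At -O2 the vector-by-scalar multiplications are expected to map back to the
element-wise fmul/mul instructions, which is consistent with the claim above
that this patch causes no code generation change.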