From patchwork Sat Oct 18 08:03:10 2014
X-Patchwork-Submitter: Marc Glisse
X-Patchwork-Id: 400658
Date: Sat, 18 Oct 2014 10:03:10 +0200 (CEST)
From: Marc Glisse
To: Uros Bizjak
Cc: "gcc-patches@gcc.gnu.org"
Subject: Re: [x86] Replace builtins with vector extensions

On Fri, 17 Oct 2014, Marc Glisse wrote:

> Thanks. I am testing the updated patch tonight and I'll commit. I am planning
> on making a branch: ^/branches/x86-intrinsics-ext if noone complains about
> the name.

Committed patch attached.
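To illustrate the point of the change (this mirrors the new intrinsics_opt-1.c
testcase in the patch below; the exact include and compile flags here are
illustrative assumptions, not part of the patch): with _mm_mul_pd and
_mm_add_pd defined as plain vector arithmetic instead of opaque builtins, the
middle end can optimize across the intrinsics, for instance contracting the
multiply and add into a single FMA when compiled with something like
-O2 -mfma:

  /* Illustrative sketch only.  Assumes <emmintrin.h> declares
     _mm_mul_pd/_mm_add_pd, as in GCC's headers.  */
  #include <emmintrin.h>

  __m128d muladd (__m128d x, __m128d y, __m128d z)
  {
    __m128d m = _mm_mul_pd (x, y);  /* now a plain (__v2df) multiply */
    return _mm_add_pd (m, z);       /* can be fused with the multiply into vfmadd */
  }

The same visibility also enables the constant folding and reassociation
checked by the other new testcases.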
Index: gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c
===================================================================
--- gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c (revision 0)
+++ gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c (revision 216422)
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O -msse2" } */
+
+#include 
+
+double f(){
+  __m128d x = _mm_set1_pd (0.);
+  double r = 42;
+  _mm_storeh_pd (&r, x);
+  return r;
+}
+
+/* { dg-final { scan-assembler-not "unpckhpd" } } */
Index: gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c
===================================================================
--- gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c (revision 0)
+++ gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c (revision 216422)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O -ffast-math -msse2" } */
+
+#include 
+
+__m128d f(__m128d x, __m128d y, __m128d z){
+  y = _mm_add_pd (x, y);
+  y = _mm_add_pd (z, y);
+  return _mm_sub_pd (y, x);
+}
+
+/* { dg-final { scan-assembler-not "subpd" } } */
Index: gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c (revision 0)
+++ gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c (revision 216422)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfma" } */
+
+#include 
+
+__m128d myfma(__m128d x, __m128d y, __m128d z){
+  __m128d m = _mm_mul_pd (x, y);
+  return _mm_add_pd (m, z);
+}
+
+/* { dg-final { scan-assembler "vfmadd" } } */
Index: gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c (revision 0)
+++ gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c (revision 216422)
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O -ffast-math -msse2 -fdump-tree-optimized" } */
+
+#include 
+
+int f(__m128d x){
+  x = _mm_sub_pd (x, x);
+  x = _mm_mul_pd (x, x);
+  double r = 42;
+  _mm_storeh_pd (&r, x);
+  int z = r == 0;
+  return __builtin_constant_p (z) && z;
+}
+
+/* { dg-final { scan-tree-dump "return 1;" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/ChangeLog.x86-intrinsics-ext
===================================================================
--- gcc/testsuite/ChangeLog.x86-intrinsics-ext (revision 0)
+++ gcc/testsuite/ChangeLog.x86-intrinsics-ext (revision 216422)
@@ -0,0 +1,6 @@
+2014-10-18  Marc Glisse
+
+	* gcc.target/i386/intrinsics_opt-1.c: New testcase.
+	* gcc.target/i386/intrinsics_opt-2.c: Likewise.
+	* gcc.target/i386/intrinsics_opt-3.c: Likewise.
+	* gcc.target/i386/intrinsics_opt-4.c: Likewise.
Index: gcc/ChangeLog.x86-intrinsics-ext
===================================================================
--- gcc/ChangeLog.x86-intrinsics-ext (revision 0)
+++ gcc/ChangeLog.x86-intrinsics-ext (revision 216422)
@@ -0,0 +1,15 @@
+2014-10-18  Marc Glisse
+
+	* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
+	_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
+	instead of builtins.
+	* config/i386/avxintrin.h (_mm256_add_pd, _mm256_add_ps,
+	_mm256_div_pd, _mm256_div_ps, _mm256_mul_pd, _mm256_mul_ps,
+	_mm256_sub_pd, _mm256_sub_ps): Likewise.
+	* config/i386/avx512fintrin.h (_mm512_add_pd, _mm512_add_ps,
+	_mm512_sub_pd, _mm512_sub_ps, _mm512_mul_pd, _mm512_mul_ps,
+	_mm512_div_pd, _mm512_div_ps): Likewise.
+	* config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
+	_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
+	_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64): Likewise.
+	(_mm_sqrt_sd): Fix comment.
Index: gcc/config/i386/xmmintrin.h
===================================================================
--- gcc/config/i386/xmmintrin.h (revision 216421)
+++ gcc/config/i386/xmmintrin.h (revision 216422)
@@ -173,39 +173,39 @@ extern __inline __m128 __attribute__((__
 _mm_max_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform the respective operation on the four SPFP values in A and B. */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) ((__v4sf)__A + (__v4sf)__B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) ((__v4sf)__A - (__v4sf)__B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) ((__v4sf)__A * (__v4sf)__B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) ((__v4sf)__A / (__v4sf)__B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_rcp_ps (__m128 __A)
@@ -950,27 +950,27 @@ _mm_set_ps (const float __Z, const float
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 {
   return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
 }
 
 /* Stores the lower SPFP value. */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  *__P = ((__v4sf)__A)[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtss_f32 (__m128 __A)
 {
-  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  return ((__v4sf)__A)[0];
 }
 
 /* Store four SPFP values. The address must be 16-byte aligned. */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ps (float *__P, __m128 __A)
 {
   *(__v4sf *)__P = (__v4sf)__A;
 }
 
 /* Store four SPFP values. The address need not be 16-byte aligned. */
Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h (revision 216421)
+++ gcc/config/i386/emmintrin.h (revision 216422)
@@ -161,40 +161,40 @@ _mm_store_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_pd (double *__P, __m128d __A)
 {
   __builtin_ia32_storeupd (__P, __A);
 }
 
 /* Stores the lower DPFP value. */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
+  *__P = ((__v2df)__A)[0];
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsd_f64 (__m128d __A)
 {
-  return __builtin_ia32_vec_ext_v2df (__A, 0);
+  return ((__v2df)__A)[0];
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_pd (double *__P, __m128d __A)
 {
   _mm_store_sd (__P, __A);
 }
 
 /* Stores the upper DPFP value. */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeh_pd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
+  *__P = ((__v2df)__A)[1];
 }
 
 /* Store the lower DPFP value across two words.
    The address must be 16-byte aligned. */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store1_pd (double *__P, __m128d __A)
 {
   _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
 }
@@ -215,86 +215,86 @@ extern __inline int __attribute__((__gnu
 _mm_cvtsi128_si32 (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
 }
 
 #ifdef __x86_64__
 /* Intel intrinsic. */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64 (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return ((__v2di)__A)[0];
 }
 
 /* Microsoft intrinsic. */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return ((__v2di)__A)[0];
 }
 #endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) ((__v2df)__A + (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) ((__v2df)__A - (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) ((__v2df)__A * (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) ((__v2df)__A / (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
-/* Return pair {sqrt (A[0), B[1]}. */
+/* Return pair {sqrt (B[0]), A[1]}. */
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
@@ -708,27 +708,27 @@ _mm_store_si128 (__m128i *__P, __m128i _
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_si128 (__m128i *__P, __m128i __B)
 {
   __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_epi64 (__m128i *__P, __m128i __B)
 {
-  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  *(long long *)__P = ((__v2di)__B)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movepi64_pi64 (__m128i __B)
 {
-  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  return (__m64) ((__v2di)__B)[0];
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movpi64_epi64 (__m64 __A)
 {
   return _mm_set_epi64 ((__m64)0LL, __A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_epi64 (__m128i __A)
Index: gcc/config/i386/avx512fintrin.h
===================================================================
--- gcc/config/i386/avx512fintrin.h (revision 216421)
+++ gcc/config/i386/avx512fintrin.h (revision 216422)
@@ -10742,26 +10742,21 @@ _mm512_maskz_sqrt_ps (__mmask16 __U, __m
 						   (__v16sf)
 						   _mm512_setzero_ps (),
 						   (__mmask16) __U,
 						   _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_add_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
-						 (__v8df) __B,
-						 (__v8df)
-						 _mm512_undefined_pd (),
-						 (__mmask8) -1,
-						 _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) ((__v8df)__A + (__v8df)__B);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
 						 (__v8df) __B,
 						 (__v8df) __W,
 						 (__mmask8) __U,
@@ -10777,26 +10772,21 @@ _mm512_maskz_add_pd (__mmask8 __U, __m51
 						  (__v8df)
 						  _mm512_setzero_pd (),
 						  (__mmask8) __U,
 						  _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_add_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
-						(__v16sf) __B,
-						(__v16sf)
-						_mm512_undefined_ps (),
-						(__mmask16) -1,
-						_MM_FROUND_CUR_DIRECTION);
+  return (__m512) ((__v16sf)__A + (__v16sf)__B);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
 						(__v16sf) __B,
 						(__v16sf) __W,
 						(__mmask16) __U,
@@ -10812,26 +10802,21 @@ _mm512_maskz_add_ps (__mmask16 __U, __m5
 						 (__v16sf)
 						 _mm512_setzero_ps (),
 						 (__mmask16) __U,
 						 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_sub_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
-						 (__v8df) __B,
-						 (__v8df)
-						 _mm512_undefined_pd (),
-						 (__mmask8) -1,
-						 _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) ((__v8df)__A - (__v8df)__B);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
 						 (__v8df) __B,
 						 (__v8df) __W,
 						 (__mmask8) __U,
@@ -10847,26 +10832,21 @@ _mm512_maskz_sub_pd (__mmask8 __U, __m51
 						  (__v8df)
 						  _mm512_setzero_pd (),
 						  (__mmask8) __U,
 						  _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_sub_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
-						(__v16sf) __B,
-						(__v16sf)
-						_mm512_undefined_ps (),
-						(__mmask16) -1,
-						_MM_FROUND_CUR_DIRECTION);
+  return (__m512) ((__v16sf)__A - (__v16sf)__B);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
 						(__v16sf) __B,
 						(__v16sf) __W,
 						(__mmask16) __U,
@@ -10882,26 +10862,21 @@ _mm512_maskz_sub_ps (__mmask16 __U, __m5
 						 (__v16sf)
 						 _mm512_setzero_ps (),
 						 (__mmask16) __U,
 						 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mul_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
-						 (__v8df) __B,
-						 (__v8df)
-						 _mm512_undefined_pd (),
-						 (__mmask8) -1,
-						 _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) ((__v8df)__A * (__v8df)__B);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
 						 (__v8df) __B,
 						 (__v8df) __W,
 						 (__mmask8) __U,
@@ -10917,26 +10892,21 @@ _mm512_maskz_mul_pd (__mmask8 __U, __m51
 						  (__v8df)
 						  _mm512_setzero_pd (),
 						  (__mmask8) __U,
 						  _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mul_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
-						(__v16sf) __B,
-						(__v16sf)
-						_mm512_undefined_ps (),
-						(__mmask16) -1,
-						_MM_FROUND_CUR_DIRECTION);
+  return (__m512) ((__v16sf)__A * (__v16sf)__B);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
 						(__v16sf) __B,
 						(__v16sf) __W,
 						(__mmask16) __U,
@@ -10952,26 +10922,21 @@ _mm512_maskz_mul_ps (__mmask16 __U, __m5
 						 (__v16sf)
 						 _mm512_setzero_ps (),
 						 (__mmask16) __U,
 						 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_div_pd (__m512d __M, __m512d __V)
 {
-  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
-						 (__v8df) __V,
-						 (__v8df)
-						 _mm512_undefined_pd (),
-						 (__mmask8) -1,
-						 _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) ((__v8df)__M / (__v8df)__V);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V)
 {
   return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
 						 (__v8df) __V,
 						 (__v8df) __W,
 						 (__mmask8) __U,
@@ -10987,26 +10952,21 @@ _mm512_maskz_div_pd (__mmask8 __U, __m51
 						  (__v8df)
 						  _mm512_setzero_pd (),
 						  (__mmask8) __U,
 						  _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_div_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
-						(__v16sf) __B,
-						(__v16sf)
-						_mm512_undefined_ps (),
-						(__mmask16) -1,
-						_MM_FROUND_CUR_DIRECTION);
+  return (__m512) ((__v16sf)__A / (__v16sf)__B);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
 						(__v16sf) __B,
 						(__v16sf) __W,
 						(__mmask16) __U,
Index: gcc/config/i386/avxintrin.h
===================================================================
--- gcc/config/i386/avxintrin.h (revision 216421)
+++ gcc/config/i386/avxintrin.h (revision 216422)
@@ -117,27 +117,27 @@ typedef double __m256d __attribute__ ((_
 /* Greater-than-or-equal (ordered, non-signaling)  */
 #define _CMP_GE_OQ	0x1d
 /* Greater-than (ordered, non-signaling)  */
 #define _CMP_GT_OQ	0x1e
 /* True (unordered, signaling)  */
 #define _CMP_TRUE_US	0x1f
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
+  return (__m256d) ((__v4df)__A + (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
+  return (__m256) ((__v8sf)__A + (__v8sf)__B);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_addsub_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_addsub_ps (__m256 __A, __m256 __B)
@@ -211,27 +211,27 @@ extern __inline __m256 __attribute__((__
 _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
 {
   return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
 					      (__v8sf)__Y,
 					      (__v8sf)__M);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_div_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
+  return (__m256d) ((__v4df)__A / (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_div_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
+  return (__m256) ((__v8sf)__A / (__v8sf)__B);
 }
 
 /* Dot product instructions with mask-defined summing and zeroing parts
    of result. */
 #ifdef __OPTIMIZE__
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
 {
   return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
@@ -288,27 +288,27 @@ _mm256_min_pd (__m256d __A, __m256d __B)
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_min_ps (__m256 __A, __m256 __B)
 {
   return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mul_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
+  return (__m256d) ((__v4df)__A * (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mul_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
+  return (__m256) ((__v8sf)__A * (__v8sf)__B);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_or_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_or_ps (__m256 __A, __m256 __B)
@@ -336,27 +336,27 @@ _mm256_shuffle_ps (__m256 __A, __m256 __
 					(__v4df)(__m256d)(B), (int)(N)))
 #define _mm256_shuffle_ps(A, B, N) \
   ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
 				      (__v8sf)(__m256)(B), (int)(N)))
 #endif
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
+  return (__m256d) ((__v4df)__A - (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
+  return (__m256) ((__v8sf)__A - (__v8sf)__B);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_xor_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_xor_ps (__m256 __A, __m256 __B)