From patchwork Sun Apr 7 10:28:29 2013
X-Patchwork-Submitter: Marc Glisse
X-Patchwork-Id: 234473
Date: Sun, 7 Apr 2013 12:28:29 +0200 (CEST)
From: Marc Glisse
To: gcc-patches@gcc.gnu.org
Subject: [i386] Replace builtins with vector extensions

Hello,

the attached patch is very incomplete (it does pass bootstrap + testsuite
on x86_64-linux-gnu), but it raises a number of questions that I'd like
to settle before continuing:

* Is there any chance of a patch in this direction being accepted?

* May I remove the builtins (from i386.c and the documentation) once
  they become unused?

* Do we want to keep the casts even when they don't seem strictly
  necessary?  For instance, for _mm_add_ps we can write either:

    return __A + __B;

  or:

    return (__m128) ((__v4sf)__A + (__v4sf)__B);

  Note that for _mm_add_epi8, for instance, we do need the casts.

* For integer operations like _mm_add_epi16, should I use the unsigned
  typedefs to make it clear that overflow is well defined?  (The patch
  still has the signed version.)

* Any better name than __v4su for the unsigned version of __v4si?

* Other comments?
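To make the cast and signedness questions concrete, here is a small
standalone sketch (my illustration, not part of the patch; the my_*
names are hypothetical stand-ins for the typedefs used in the headers).
It relies only on GCC's documented vector-extension behaviour:
arithmetic works directly on vectors with the same element type, a cast
between equal-sized vector types reinterprets the bits, and >> is an
arithmetic shift for signed element types but a logical shift for
unsigned ones:

    /* Standalone sketch, hypothetical names; compile with e.g.
       gcc -O2 -msse2 -c demo.c  */
    typedef float          my_v4sf  __attribute__ ((__vector_size__ (16)));
    typedef char           my_v16qi __attribute__ ((__vector_size__ (16)));
    typedef short          my_v8hi  __attribute__ ((__vector_size__ (16)));
    typedef unsigned short my_v8hu  __attribute__ ((__vector_size__ (16)));
    typedef long long      my_m128i __attribute__ ((__vector_size__ (16)));

    /* No cast needed: the element type is already float (cf. _mm_add_ps).  */
    my_v4sf my_add_ps (my_v4sf a, my_v4sf b) { return a + b; }

    /* Casts needed: without them this would add two long long lanes;
       with them it is a 16-way byte addition (cf. _mm_add_epi8).  */
    my_m128i my_add_epi8 (my_m128i a, my_m128i b)
    { return (my_m128i) ((my_v16qi) a + (my_v16qi) b); }

    /* Signed lanes shift arithmetically (cf. _mm_srai_epi16); casting
       to unsigned lanes gives the logical shift (cf. _mm_srli_epi16).  */
    my_v8hi my_sra (my_v8hi a, int n) { return a >> n; }
    my_v8hi my_srl (my_v8hi a, int n) { return (my_v8hi) ((my_v8hu) a >> n); }

The same unsigned element types would also make the wraparound on
_mm_add_epi16 explicit, since unsigned arithmetic is defined to wrap,
which is what motivates the question above.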
2013-04-07  Marc Glisse

	* emmintrin.h (__v2du, __v4su, __v8hu): New typedefs.
	(_mm_add_pd, _mm_sub_pd, _mm_mul_pd, _mm_div_pd, _mm_cmpeq_pd,
	_mm_cmplt_pd, _mm_cmple_pd, _mm_cmpgt_pd, _mm_cmpge_pd,
	_mm_cmpneq_pd, _mm_add_epi8, _mm_add_epi16, _mm_add_epi32,
	_mm_add_epi64, _mm_slli_epi16, _mm_slli_epi32, _mm_slli_epi64,
	_mm_srai_epi16, _mm_srai_epi32, _mm_srli_epi16, _mm_srli_epi32,
	_mm_srli_epi64): Replace builtins with vector extensions.
	* xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps, _mm_div_ps,
	_mm_cmpeq_ps, _mm_cmplt_ps, _mm_cmple_ps, _mm_cmpgt_ps,
	_mm_cmpge_ps, _mm_cmpneq_ps): Likewise.

Index: config/i386/xmmintrin.h
===================================================================
--- config/i386/xmmintrin.h	(revision 197549)
+++ config/i386/xmmintrin.h	(working copy)
@@ -147,39 +147,39 @@ extern __inline __m128 __attribute__((__
 _mm_max_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform the respective operation on the four SPFP values in A and B.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+  return __A / __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_rcp_ps (__m128 __A)
@@ -323,51 +323,51 @@ _mm_cmpunord_ss (__m128 __A, __m128 __B)
   return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform a comparison on the four SPFP values of A and B.  For each
    element, if the comparison is true, place a mask of all ones in the
    result, otherwise a mask of zeros.  */
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A == __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmplt_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A < __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmple_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A <= __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpgt_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A > __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpge_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A >= __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpneq_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A != __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnle_ps (__m128 __A, __m128 __B)
Index: config/i386/emmintrin.h
===================================================================
--- config/i386/emmintrin.h	(revision 197549)
+++ config/i386/emmintrin.h	(working copy)
@@ -30,22 +30,25 @@
 #ifndef __SSE2__
 # error "SSE2 instruction set not enabled"
 #else
 
 /* We need definitions from the SSE header files*/
 #include <xmmintrin.h>
 
 /* SSE2 */
 typedef double __v2df __attribute__ ((__vector_size__ (16)));
 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
 typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
 typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
 typedef char __v16qi __attribute__ ((__vector_size__ (16)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
  (((fp1) << 1) | (fp0))
@@ -219,72 +222,72 @@ _mm_cvtsi128_si64 (__m128i __A)
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
 }
 #endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+  return __A + __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+  return __A - __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+  return __A * __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+  return __A / __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
-/* Return pair {sqrt (A[0), B[1]}.  */
+/* Return pair {sqrt (A[0]), B[1]}.  */
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
@@ -329,51 +332,51 @@ _mm_or_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_xor_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A == __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmplt_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A < __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmple_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A <= __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpgt_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A > __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpge_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A >= __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpneq_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A != __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnle_pd (__m128d __A, __m128d __B)
@@ -981,39 +984,39 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi8 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+  return (__m128i) ((__v16qi)__A + (__v16qi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi16 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+  return (__m128i) ((__v8hi)__A + (__v8hi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi32 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+  return (__m128i) ((__v4si)__A + (__v4si)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+  return (__m128i) ((__v2di)__A + (__v2di)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epi16 (__m128i __A, __m128i __B)
@@ -1107,45 +1110,45 @@ _mm_mul_su32 (__m64 __A, __m64 __B)
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_epu32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hi)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4si)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+  return (__m128i) ((__v2di)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srai_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hi)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srai_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4si)__A >> __B);
 }
 
 #ifdef __OPTIMIZE__
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_si128 (__m128i __A, const int __N)
 {
   return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1156,33 +1159,33 @@ _mm_slli_si128 (__m128i __A, const int _
 #else
 #define _mm_srli_si128(A, N) \
   ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
 #define _mm_slli_si128(A, N) \
   ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
 #endif
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hu)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4su)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+  return (__m128i) ((__v2du)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sll_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sll_epi32 (__m128i __A, __m128i __B)