From patchwork Tue Oct 12 01:49:43 2010
X-Patchwork-Submitter: "H.J. Lu"
X-Patchwork-Id: 67499
Date: Mon, 11 Oct 2010 18:49:43 -0700
Subject: Re: [PATCH][RFC] 256bit AVX vectorization support
From: "H.J. Lu"
To: Richard Guenther, Uros Bizjak, Richard Henderson
Cc: gcc-patches@gcc.gnu.org

On Tue, Oct 5, 2010 at 7:49 AM, Richard Guenther wrote:
>
> This is another try at enabling some AVX auto-vectorization for GCC 4.6.
> I wasn't happy with my previous patch at all given its intrusiveness.
>
> Thus here is a simpler approach - iterate over a set of desired vector
> sizes.  In the testsuite in its current state the patch causes FAILs
> without -mavx because we try MMX vector sizes (which is probably
> a bad idea); with -mavx we get extra FAILs because some things are
> now dumped twice instead of once due to the iteration
> (like "strided access in outer loop").
>
> The patch asks for two cleanups:
>
>  1) targetm.units_per_simd_word should return a mode (and be renamed)
>  2) we need a target hook that specifies what sizes to iterate over
>     (solves the MMX problem and doesn't pessimize targets with a
>     sane vector instruction set)
>
> Building SPEC CPU 2006 with -mavx on x86_64 gives me AVX-vectorized
> loops for most FP loops (3952 loops before the machine died processing
> 481.wrf), so it seems to be working.
>
> Also bootstrapped on x86_64-unknown-linux-gnu, but not applicable yet.
>
> HJ, do you plan to bring the i386 backend changes from the vect256 branch
> to trunk?
>

Here is the 256bit AVX x86 backend patch.  There is no regression on
Linux/ia32 and Linux/x86-64.  OK to install?

Thanks.
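For context, cleanup 2) amounts to a target hook like the sketch below.  The
hook name matches what eventually landed for GCC 4.6
(TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES); the body here is illustrative
only and is not part of the patch under review:

  /* Return a bitmask of vector sizes, in bytes, that the auto-vectorizer
     should iterate over.  Returning 0 means "only the preferred SIMD
     mode".  Illustrative sketch, not part of this patch.  */

  static unsigned int
  ix86_autovectorize_vector_sizes (void)
  {
    /* Try 32-byte AVX vectors first, then fall back to 16-byte SSE
       vectors; never suggest 8-byte MMX vectors.  */
    return TARGET_AVX ? 32 | 16 : 0;
  }

  #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
  #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
    ix86_autovectorize_vector_sizes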
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 1d79a18..33810ff 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -15204,17 +15204,28 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
   rtvec v;
   switch (mode)
     {
-    case SImode:
+    case V4SImode:
       gcc_assert (vect);
       v = gen_rtvec (4, value, value, value, value);
       return gen_rtx_CONST_VECTOR (V4SImode, v);
 
-    case DImode:
+    case V2DImode:
       gcc_assert (vect);
       v = gen_rtvec (2, value, value);
       return gen_rtx_CONST_VECTOR (V2DImode, v);
 
-    case SFmode:
+    case V8SFmode:
+      if (vect)
+	v = gen_rtvec (8, value, value, value, value,
+		       value, value, value, value);
+      else
+	v = gen_rtvec (8, value, CONST0_RTX (SFmode),
+		       CONST0_RTX (SFmode), CONST0_RTX (SFmode),
+		       CONST0_RTX (SFmode), CONST0_RTX (SFmode),
+		       CONST0_RTX (SFmode), CONST0_RTX (SFmode));
+      return gen_rtx_CONST_VECTOR (V8SFmode, v);
+
+    case V4SFmode:
       if (vect)
 	v = gen_rtvec (4, value, value, value, value);
       else
@@ -15222,7 +15233,15 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
 		       CONST0_RTX (SFmode), CONST0_RTX (SFmode));
       return gen_rtx_CONST_VECTOR (V4SFmode, v);
 
-    case DFmode:
+    case V4DFmode:
+      if (vect)
+	v = gen_rtvec (4, value, value, value, value);
+      else
+	v = gen_rtvec (4, value, CONST0_RTX (DFmode),
+		       CONST0_RTX (DFmode), CONST0_RTX (DFmode));
+      return gen_rtx_CONST_VECTOR (V4DFmode, v);
+
+    case V2DFmode:
       if (vect)
 	v = gen_rtvec (2, value, value);
       else
@@ -15252,17 +15271,21 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
   /* Find the sign bit, sign extended to 2*HWI.  */
   switch (mode)
     {
-    case SImode:
-    case SFmode:
+    case V4SImode:
+    case V8SFmode:
+    case V4SFmode:
+      vec_mode = mode;
+      mode = GET_MODE_INNER (mode);
       imode = SImode;
-      vec_mode = (mode == SImode) ? V4SImode : V4SFmode;
       lo = 0x80000000, hi = lo < 0;
       break;
 
-    case DImode:
-    case DFmode:
+    case V2DImode:
+    case V4DFmode:
+    case V2DFmode:
+      vec_mode = mode;
+      mode = GET_MODE_INNER (mode);
       imode = DImode;
-      vec_mode = (mode == DImode) ? V2DImode : V2DFmode;
       if (HOST_BITS_PER_WIDE_INT >= 64)
 	lo = (HOST_WIDE_INT)1 << shift, hi = -1;
       else
@@ -15316,7 +15339,7 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
   if (vec_mode == VOIDmode)
     return force_reg (mode, mask);
 
-  v = ix86_build_const_vector (mode, vect, mask);
+  v = ix86_build_const_vector (vec_mode, vect, mask);
   return force_reg (vec_mode, v);
 }
 
@@ -15329,22 +15352,25 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
   rtx mask, set, use, clob, dst, src;
   bool use_sse = false;
   bool vector_mode = VECTOR_MODE_P (mode);
-  enum machine_mode elt_mode = mode;
+  enum machine_mode vmode = mode;
 
   if (vector_mode)
-    {
-      elt_mode = GET_MODE_INNER (mode);
-      use_sse = true;
-    }
+    use_sse = true;
   else if (mode == TFmode)
     use_sse = true;
   else if (TARGET_SSE_MATH)
-    use_sse = SSE_FLOAT_MODE_P (mode);
+    {
+      use_sse = SSE_FLOAT_MODE_P (mode);
+      if (mode == SFmode)
+	vmode = V4SFmode;
+      else if (mode == DFmode)
+	vmode = V2DFmode;
+    }
 
   /* NEG and ABS performed with SSE use bitwise mask operations.
      Create the appropriate mask now.  */
   if (use_sse)
-    mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS);
+    mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
   else
     mask = NULL_RTX;
 
@@ -15378,7 +15404,7 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
 void
 ix86_expand_copysign (rtx operands[])
 {
-  enum machine_mode mode;
+  enum machine_mode mode, vmode;
   rtx dest, op0, op1, mask, nmask;
 
   dest = operands[0];
@@ -15387,6 +15413,13 @@ ix86_expand_copysign (rtx operands[])
 
   mode = GET_MODE (dest);
 
+  if (mode == SFmode)
+    vmode = V4SFmode;
+  else if (mode == DFmode)
+    vmode = V2DFmode;
+  else
+    vmode = mode;
+
   if (GET_CODE (op0) == CONST_DOUBLE)
     {
       rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
@@ -15396,15 +15429,11 @@ ix86_expand_copysign (rtx operands[])
 
       if (mode == SFmode || mode == DFmode)
 	{
-	  enum machine_mode vmode;
-
-	  vmode = mode == SFmode ? V4SFmode : V2DFmode;
-
 	  if (op0 == CONST0_RTX (mode))
 	    op0 = CONST0_RTX (vmode);
 	  else
 	    {
-	      rtx v = ix86_build_const_vector (mode, false, op0);
+	      rtx v = ix86_build_const_vector (vmode, false, op0);
 
 	      op0 = force_reg (vmode, v);
 	    }
 	}
@@ -15412,7 +15441,7 @@ ix86_expand_copysign (rtx operands[])
       else if (op0 != CONST0_RTX (mode))
 	op0 = force_reg (mode, op0);
 
-      mask = ix86_build_signbit_mask (mode, 0, 0);
+      mask = ix86_build_signbit_mask (vmode, 0, 0);
 
       if (mode == SFmode)
 	copysign_insn = gen_copysignsf3_const;
@@ -15427,8 +15456,8 @@ ix86_expand_copysign (rtx operands[])
     {
       rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
 
-      nmask = ix86_build_signbit_mask (mode, 0, 1);
-      mask = ix86_build_signbit_mask (mode, 0, 0);
+      nmask = ix86_build_signbit_mask (vmode, 0, 1);
+      mask = ix86_build_signbit_mask (vmode, 0, 0);
 
       if (mode == SFmode)
 	copysign_insn = gen_copysignsf3_var;
@@ -17335,8 +17364,7 @@ ix86_expand_int_vcond (rtx operands[])
 
 	  /* Subtract (-(INT MAX) - 1) from both operands to make
 	     them signed.  */
-	  mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
-					  true, false);
+	  mask = ix86_build_signbit_mask (mode, true, false);
 	  gen_sub3 = (mode == V4SImode
 		      ? gen_subv4si3 : gen_subv2di3);
 	  t1 = gen_reg_rtx (mode);
@@ -22157,6 +22185,8 @@ enum ix86_builtins
   /* Vectorizer support builtins.  */
   IX86_BUILTIN_CPYSGNPS,
   IX86_BUILTIN_CPYSGNPD,
+  IX86_BUILTIN_CPYSGNPS256,
+  IX86_BUILTIN_CPYSGNPD256,
 
   IX86_BUILTIN_CVTUDQ2PS,
@@ -23294,6 +23324,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
 
+  { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+
   { OPTION_MASK_ISA_ABM, CODE_FOR_clzhi2_abm, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
 
   /* F16C */
@@ -25480,15 +25513,23 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
   switch (fn)
     {
     case BUILT_IN_SQRT:
-      if (out_mode == DFmode && out_n == 2
-	  && in_mode == DFmode && in_n == 2)
-	return ix86_builtins[IX86_BUILTIN_SQRTPD];
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_SQRTPD];
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_SQRTPD256];
+	}
       break;
 
     case BUILT_IN_SQRTF:
-      if (out_mode == SFmode && out_n == 4
-	  && in_mode == SFmode && in_n == 4)
-	return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
+	}
       break;
 
     case BUILT_IN_LRINT:
@@ -25498,21 +25539,33 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
       break;
 
     case BUILT_IN_LRINTF:
-      if (out_mode == SImode && out_n == 4
-	  && in_mode == SFmode && in_n == 4)
-	return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
+      if (out_mode == SImode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
+	}
       break;
 
     case BUILT_IN_COPYSIGN:
-      if (out_mode == DFmode && out_n == 2
-	  && in_mode == DFmode && in_n == 2)
-	return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
+	}
       break;
 
     case BUILT_IN_COPYSIGNF:
-      if (out_mode == SFmode && out_n == 4
-	  && in_mode == SFmode && in_n == 4)
-	return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
+	}
       break;
 
     default:
@@ -25835,6 +25888,9 @@ ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
       case IX86_BUILTIN_SQRTPS_NR:
 	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
 
+      case IX86_BUILTIN_SQRTPS_NR256:
+	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
+
       default:
 	return NULL_TREE;
       }
@@ -29377,7 +29433,7 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
 
   two = CONST_DOUBLE_FROM_REAL_VALUE (dconst2, SFmode);
   if (VECTOR_MODE_P (mode))
-    two = ix86_build_const_vector (SFmode, true, two);
+    two = ix86_build_const_vector (mode, true, two);
 
   two = force_reg (mode, two);
@@ -29424,8 +29480,8 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
 
   if (VECTOR_MODE_P (mode))
     {
-      mthree = ix86_build_const_vector (SFmode, true, mthree);
-      mhalf = ix86_build_const_vector (SFmode, true, mhalf);
+      mthree = ix86_build_const_vector (mode, true, mthree);
+      mhalf = ix86_build_const_vector (mode, true, mhalf);
     }
 
   /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
@@ -29570,7 +29626,16 @@ ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
   rtx sgn = gen_reg_rtx (mode);
   if (mask == NULL_RTX)
     {
-      mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), false);
+      enum machine_mode vmode;
+
+      if (mode == SFmode)
+	vmode = V4SFmode;
+      else if (mode == DFmode)
+	vmode = V2DFmode;
+      else
+	vmode = mode;
+
+      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
       if (!VECTOR_MODE_P (mode))
 	{
 	  /* We need to generate a scalar mode mask in this case.  */
@@ -29594,11 +29659,17 @@ ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
 static rtx
 ix86_expand_sse_fabs (rtx op0, rtx *smask)
 {
-  enum machine_mode mode = GET_MODE (op0);
+  enum machine_mode vmode, mode = GET_MODE (op0);
   rtx xa, mask;
 
   xa = gen_reg_rtx (mode);
-  mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), true);
+  if (mode == SFmode)
+    vmode = V4SFmode;
+  else if (mode == DFmode)
+    vmode = V2DFmode;
+  else
+    vmode = mode;
+  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
   if (!VECTOR_MODE_P (mode))
     {
       /* We need to generate a scalar mode mask in this case.  */
@@ -30941,7 +31012,7 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
 static bool
 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 {
-  rtx t1, t2, t3, t4;
+  rtx t1, t2, t3;
 
   switch (d->vmode)
     {
@@ -30963,34 +31034,34 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 
     case V8SFmode:
       {
-	static const unsigned char perm1[8] = { 0, 2, 1, 3, 5, 6, 5, 7 };
-	static const unsigned char perme[8] = { 0, 1, 8, 9, 4, 5, 12, 13 };
-	static const unsigned char permo[8] = { 2, 3, 10, 11, 6, 7, 14, 15 };
+	int mask = odd ? 0xdd : 0x88;
 
 	t1 = gen_reg_rtx (V8SFmode);
 	t2 = gen_reg_rtx (V8SFmode);
 	t3 = gen_reg_rtx (V8SFmode);
-	t4 = gen_reg_rtx (V8SFmode);
 
 	/* Shuffle within the 128-bit lanes to produce:
-	   { 0 2 1 3 4 6 5 7 } and { 8 a 9 b c e d f }.  */
-	expand_vselect (t1, d->op0, perm1, 8);
-	expand_vselect (t2, d->op1, perm1, 8);
+	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
+	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
+				      GEN_INT (mask)));
+
+	/* Shuffle the lanes around to produce:
+	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
+	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
+					    GEN_INT (0x3)));
+
+	/* Shuffle within the 128-bit lanes to produce:
+	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
+	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
+
+	/* Shuffle within the 128-bit lanes to produce:
+	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
+	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
 
 	/* Shuffle the lanes around to produce:
-	   { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
-	emit_insn (gen_avx_vperm2f128v8sf3 (t3, t1, t2, GEN_INT (0x20)));
-	emit_insn (gen_avx_vperm2f128v8sf3 (t4, t1, t2, GEN_INT (0x31)));
-
-	/* Now a vpermil2p will produce the result required.  */
-	/* ??? The vpermil2p requires a vector constant.  Another option
-	   is a unpck[lh]ps to merge the two vectors to produce
-	   { 0 4 2 6 8 c a e } or { 1 5 3 7 9 d b f }.  Then use another
Then use another - vpermilps to get the elements into the final order. */ - d->op0 = t3; - d->op1 = t4; - memcpy (d->perm, odd ? permo: perme, 8); - expand_vec_perm_vpermil (d); + { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ + emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, + GEN_INT (0x20))); } break; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 9780eef..1d92601 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4518,7 +4518,7 @@ real_ldexp (&TWO31r, &dconst1, 31); two31 = const_double_from_real_value (TWO31r, mode); - two31 = ix86_build_const_vector (mode, true, two31); + two31 = ix86_build_const_vector (vecmode, true, two31); operands[2] = force_reg (vecmode, two31); }) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a3488cf..068d60e 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -504,6 +504,14 @@ ; define patterns for other modes that would expand to several insns. (define_expand "storent" + [(set (match_operand:AVX256MODEF2P 0 "memory_operand" "") + (unspec:AVX256MODEF2P + [(match_operand:AVX256MODEF2P 1 "register_operand" "")] + UNSPEC_MOVNT))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "") + +(define_expand "storent" [(set (match_operand:SSEMODEF2P 0 "memory_operand" "") (unspec:SSEMODEF2P [(match_operand:SSEMODEF2P 1 "register_operand" "")] @@ -540,6 +548,13 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_expand "2" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (absneg:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "register_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "ix86_expand_fp_absneg_operator (, mode, operands); DONE;") + +(define_expand "2" [(set (match_operand:SSEMODEF2P 0 "register_operand" "") (absneg:SSEMODEF2P (match_operand:SSEMODEF2P 1 "register_operand" "")))] @@ -1385,6 +1400,19 @@ [(set_attr "type" "sseadd") (set_attr "mode" "V2DF")]) +(define_expand "reduc_splus_v8sf" + [(match_operand:V8SF 0 "register_operand" "") + (match_operand:V8SF 1 "register_operand" "")] + "TARGET_AVX" +{ + rtx tmp = gen_reg_rtx (V8SFmode); + rtx tmp2 = gen_reg_rtx (V8SFmode); + emit_insn (gen_avx_haddv8sf3 (tmp, operands[1], operands[1])); + emit_insn (gen_avx_haddv8sf3 (tmp2, operands[1], operands[1])); + emit_insn (gen_avx_haddv8sf3 (operands[0], tmp2, tmp2)); + DONE; +}) + (define_expand "reduc_splus_v4sf" [(match_operand:V4SF 0 "register_operand" "") (match_operand:V4SF 1 "register_operand" "")] @@ -1401,6 +1429,17 @@ DONE; }) +(define_expand "reduc_splus_v4df" + [(match_operand:V4DF 0 "register_operand" "") + (match_operand:V4DF 1 "register_operand" "")] + "TARGET_AVX" +{ + rtx tmp = gen_reg_rtx (V4DFmode); + emit_insn (gen_avx_haddv4df3 (tmp, operands[1], operands[1])); + emit_insn (gen_avx_haddv4df3 (operands[0], tmp, tmp)); + DONE; +}) + (define_expand "reduc_splus_v2df" [(match_operand:V2DF 0 "register_operand" "") (match_operand:V2DF 1 "register_operand" "")] @@ -1655,6 +1694,24 @@ (define_expand "copysign3" [(set (match_dup 4) + (and:AVX256MODEF2P + (not:AVX256MODEF2P (match_dup 3)) + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" ""))) + (set (match_dup 5) + (and:AVX256MODEF2P (match_dup 3) + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" ""))) + (set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (ior:AVX256MODEF2P (match_dup 4) (match_dup 5)))] + "AVX256_VEC_FLOAT_MODE_P (mode)" +{ + operands[3] = ix86_build_signbit_mask (mode, 1, 0); + + operands[4] = gen_reg_rtx (mode); + operands[5] = gen_reg_rtx (mode); 
+})
+
+(define_expand "copysign<mode>3"
+  [(set (match_dup 4)
 	(and:SSEMODEF2P
 	  (not:SSEMODEF2P (match_dup 3))
 	  (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "")))
@@ -1665,7 +1722,7 @@
 	(ior:SSEMODEF2P (match_dup 4) (match_dup 5)))]
   "SSE_VEC_FLOAT_MODE_P (<MODE>mode)"
 {
-  operands[3] = ix86_build_signbit_mask (<ssescalarmode>mode, 1, 0);
+  operands[3] = ix86_build_signbit_mask (<MODE>mode, 1, 0);
 
   operands[4] = gen_reg_rtx (<MODE>mode);
   operands[5] = gen_reg_rtx (<MODE>mode);
@@ -2662,7 +2719,8 @@
   x = const_double_from_real_value (TWO32r, SFmode);
 
   operands[3] = force_reg (V4SFmode, CONST0_RTX (V4SFmode));
-  operands[4] = force_reg (V4SFmode, ix86_build_const_vector (SFmode, 1, x));
+  operands[4] = force_reg (V4SFmode,
+			   ix86_build_const_vector (V4SFmode, 1, x));
 
   for (i = 5; i < 8; i++)
     operands[i] = gen_reg_rtx (V4SFmode);
@@ -2897,6 +2955,18 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_insn "*avx_cvtdq2pd256_2"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(float:V4DF
+	  (vec_select:V4SI
+	    (match_operand:V8SI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+  "vcvtdq2pd\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
 (define_insn "sse2_cvtdq2pd"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
 	(float:V2DF
@@ -3077,6 +3147,18 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_insn "*avx_cvtps2pd256_2"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(float_extend:V4DF
+	  (vec_select:V4SF
+	    (match_operand:V8SF 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+  "vcvtps2pd\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
 (define_insn "sse2_cvtps2pd"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
 	(float_extend:V2DF
@@ -3111,6 +3193,22 @@
   operands[2] = gen_reg_rtx (V4SFmode);
 })
 
+(define_expand "vec_unpacks_hi_v8sf"
+  [(set (match_dup 2)
+	(vec_select:V4SF
+	  (match_operand:V8SF 1 "nonimmediate_operand" "")
+	  (parallel [(const_int 4)
+		     (const_int 5)
+		     (const_int 6)
+		     (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand" "")
+	(float_extend:V4DF
+	  (match_dup 2)))]
+  "TARGET_AVX"
+{
+  operands[2] = gen_reg_rtx (V4SFmode);
+})
+
 (define_expand "vec_unpacks_lo_v4sf"
   [(set (match_operand:V2DF 0 "register_operand" "")
 	(float_extend:V2DF
@@ -3119,6 +3217,14 @@
 	  (parallel [(const_int 0) (const_int 1)]))))]
   "TARGET_SSE2")
 
+(define_expand "vec_unpacks_lo_v8sf"
+  [(set (match_operand:V4DF 0 "register_operand" "")
+	(float_extend:V4DF
+	  (vec_select:V4SF
+	    (match_operand:V8SF 1 "nonimmediate_operand" "")
+	    (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX")
+
 (define_expand "vec_unpacks_float_hi_v8hi"
   [(match_operand:V4SF 0 "register_operand" "")
    (match_operand:V8HI 1 "register_operand" "")]
@@ -3191,6 +3297,28 @@
 	    (parallel [(const_int 0) (const_int 1)]))))]
   "TARGET_SSE2")
 
+(define_expand "vec_unpacks_float_hi_v8si"
+  [(set (match_dup 2)
+	(vec_select:V4SI
+	  (match_operand:V8SI 1 "nonimmediate_operand" "")
+	  (parallel [(const_int 4)
+		     (const_int 5)
+		     (const_int 6)
+		     (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand" "")
+	(float:V4DF
+	  (match_dup 2)))]
+  "TARGET_AVX"
+  "operands[2] = gen_reg_rtx (V4SImode);")
+
+(define_expand "vec_unpacks_float_lo_v8si"
+  [(set (match_operand:V4DF 0 "register_operand" "")
+	(float:V4DF
+	  (vec_select:V4SI
+	    (match_operand:V8SI 1 "nonimmediate_operand" "")
"nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX") + (define_expand "vec_unpacku_float_hi_v4si" [(set (match_dup 5) (vec_select:V4SI @@ -3220,7 +3348,8 @@ x = const_double_from_real_value (TWO32r, DFmode); operands[3] = force_reg (V2DFmode, CONST0_RTX (V2DFmode)); - operands[4] = force_reg (V2DFmode, ix86_build_const_vector (DFmode, 1, x)); + operands[4] = force_reg (V2DFmode, + ix86_build_const_vector (V2DFmode, 1, x)); operands[5] = gen_reg_rtx (V4SImode); @@ -3250,12 +3379,30 @@ x = const_double_from_real_value (TWO32r, DFmode); operands[3] = force_reg (V2DFmode, CONST0_RTX (V2DFmode)); - operands[4] = force_reg (V2DFmode, ix86_build_const_vector (DFmode, 1, x)); + operands[4] = force_reg (V2DFmode, + ix86_build_const_vector (V2DFmode, 1, x)); for (i = 5; i < 8; i++) operands[i] = gen_reg_rtx (V2DFmode); }) +(define_expand "vec_pack_trunc_v4df" + [(set (match_dup 3) + (float_truncate:V4SF + (match_operand:V4DF 1 "nonimmediate_operand" ""))) + (set (match_dup 4) + (float_truncate:V4SF + (match_operand:V4DF 2 "nonimmediate_operand" ""))) + (set (match_operand:V8SF 0 "register_operand" "") + (vec_concat:V8SF + (match_dup 3) + (match_dup 4)))] + "TARGET_AVX" +{ + operands[3] = gen_reg_rtx (V4SFmode); + operands[4] = gen_reg_rtx (V4SFmode); +}) + (define_expand "vec_pack_trunc_v2df" [(match_operand:V4SF 0 "register_operand" "") (match_operand:V2DF 1 "nonimmediate_operand" "") @@ -3448,6 +3595,41 @@ (set_attr "prefix" "vex") (set_attr "mode" "V4SF")]) +(define_expand "vec_interleave_highv8sf" + [(set (match_dup 3) + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 4) (const_int 12) + (const_int 5) (const_int 13)]))) + (set (match_dup 4) + (vec_select:V8SF + (vec_concat:V16SF + (match_dup 1) + (match_dup 2)) + (parallel [(const_int 2) (const_int 10) + (const_int 3) (const_int 11) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)]))) + (set (match_operand:V8SF 0 "register_operand" "") + (vec_concat:V8SF + (vec_select:V4SF + (match_dup 3) + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)])) + (vec_select:V4SF + (match_dup 4) + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_AVX" +{ + operands[3] = gen_reg_rtx (V8SFmode); + operands[4] = gen_reg_rtx (V8SFmode); +}) + (define_insn "vec_interleave_highv4sf" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_select:V4SF @@ -3492,6 +3674,41 @@ (set_attr "prefix" "vex") (set_attr "mode" "V4SF")]) +(define_expand "vec_interleave_lowv8sf" + [(set (match_dup 3) + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 4) (const_int 12) + (const_int 5) (const_int 13)]))) + (set (match_dup 4) + (vec_select:V8SF + (vec_concat:V16SF + (match_dup 1) + (match_dup 2)) + (parallel [(const_int 2) (const_int 10) + (const_int 3) (const_int 11) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)]))) + (set (match_operand:V8SF 0 "register_operand" "") + (vec_concat:V8SF + (vec_select:V4SF + (match_dup 3) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])) + (vec_select:V4SF + (match_dup 4) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V8SFmode);
+  operands[4] = gen_reg_rtx (V8SFmode);
+})
+
 (define_insn "vec_interleave_lowv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
 	(vec_select:V4SF
@@ -4360,6 +4577,17 @@
 })
 
 (define_expand "vec_extract<mode>"
+  [(match_operand:<avxscalarmode> 0 "register_operand" "")
+   (match_operand:AVX256MODEF2P 1 "register_operand" "")
+   (match_operand 2 "const_int_operand" "")]
+  "TARGET_AVX"
+{
+  ix86_expand_vector_extract (false, operands[0], operands[1],
+			      INTVAL (operands[2]));
+  DONE;
+})
+
+(define_expand "vec_extract<mode>"
   [(match_operand:<ssescalarmode> 0 "register_operand" "")
    (match_operand:SSEMODE 1 "register_operand" "")
   (match_operand 2 "const_int_operand" "")]
@@ -4391,6 +4619,36 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_expand "vec_interleave_highv4df"
+  [(set (match_dup 3)
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "register_operand" "x")
+	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))
+   (set (match_dup 4)
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_dup 1)
+	    (match_dup 2))
+	  (parallel [(const_int 1) (const_int 5)
+		     (const_int 3) (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand" "")
+	(vec_concat:V4DF
+	  (vec_select:V2DF
+	    (match_dup 3)
+	    (parallel [(const_int 2) (const_int 3)]))
+	  (vec_select:V2DF
+	    (match_dup 4)
+	    (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V4DFmode);
+  operands[4] = gen_reg_rtx (V4DFmode);
+})
+
+
 (define_expand "vec_interleave_highv2df"
   [(set (match_operand:V2DF 0 "register_operand" "")
 	(vec_select:V2DF
@@ -4498,6 +4756,35 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_expand "vec_interleave_lowv4df"
+  [(set (match_dup 3)
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "register_operand" "x")
+	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))
+   (set (match_dup 4)
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_dup 1)
+	    (match_dup 2))
+	  (parallel [(const_int 1) (const_int 5)
+		     (const_int 3) (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand" "")
+	(vec_concat:V4DF
+	  (vec_select:V2DF
+	    (match_dup 3)
+	    (parallel [(const_int 0) (const_int 1)]))
+	  (vec_select:V2DF
+	    (match_dup 4)
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V4DFmode);
+  operands[4] = gen_reg_rtx (V4DFmode);
+})
+
 (define_expand "vec_interleave_lowv2df"
  [(set (match_operand:V2DF 0 "register_operand" "")
 	(vec_select:V2DF
@@ -12079,3 +12366,67 @@
   [(set_attr "type" "ssecvt")
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
+
+(define_insn "*vec_concat_lo_<mode>_avx"
+  [(set (match_operand:AVX256MODE4P 0 "register_operand" "=x")
+	(vec_concat:AVX256MODE4P
+	  (vec_select:<avxhalfvecmode>
+	    (match_operand:AVX256MODE4P 1 "register_operand" "x")
+	    (parallel [(const_int 0) (const_int 1)]))
+	  (vec_select:<avxhalfvecmode>
+	    (match_operand:AVX256MODE4P 2 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_AVX"
+  "vperm2f128\t{$0x20, %2, %1, %0|%0, %1, %2, 0x20}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "*vec_concat_hi_<mode>_avx"
+  [(set (match_operand:AVX256MODE4P 0 "register_operand" "=x")
+	(vec_concat:AVX256MODE4P
+	  (vec_select:<avxhalfvecmode>
+	    (match_operand:AVX256MODE4P 1 "register_operand" "x")
+	    (parallel [(const_int 2) (const_int 3)]))
+	  (vec_select:<avxhalfvecmode>
+	    (match_operand:AVX256MODE4P 2 "nonimmediate_operand" "xm")
"nonimmediate_operand" "xm") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_AVX" + "vperm2f128\t{$0x31, %2, %1, %0|%0, %1, %2, 0x31}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*vec_concat_lo__avx" + [(set (match_operand:AVX256MODE8P 0 "register_operand" "=x") + (vec_concat:AVX256MODE8P + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])) + (vec_select: + (match_operand:AVX256MODE8P 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_AVX" + "vperm2f128\t{$0x20, %2, %1, %0|%0, %1, %2, 0x20}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*vec_concat_hi__avx" + [(set (match_operand:AVX256MODE8P 0 "register_operand" "=x") + (vec_concat:AVX256MODE8P + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)])) + (vec_select: + (match_operand:AVX256MODE8P 2 "nonimmediate_operand" "xm") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_AVX" + "vperm2f128\t{$0x31, %2, %1, %0|%0, %1, %2, 00x31}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")])