From patchwork Wed Oct 13 20:12:29 2010
X-Patchwork-Id: 67737
Subject: Re: [PATCH][RFC] 256bit AVX vectorization support
From: "H.J. Lu"
To: Richard Henderson
Cc: Richard Guenther, Uros Bizjak, gcc-patches@gcc.gnu.org
Date: Wed, 13 Oct 2010 13:12:29 -0700
In-Reply-To: <4CB5E1B8.80307@redhat.com>
References: <4CB49385.1010805@redhat.com> <4CB5E1B8.80307@redhat.com>

On Wed, Oct 13, 2010 at 9:43 AM, Richard Henderson wrote:
> On 10/13/2010 09:19 AM, H.J. Lu wrote:
>> +(define_mode_iterator STORENT_MODE
>> +  [(SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
>> +   (SI "TARGET_SSE2") (V2DI "TARGET_SSE2") (V2DF "TARGET_SSE2")
>> +   (V4SF "TARGET_SSE")
>> +   (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")])
>
> Since all modes have a condition here ...
>
>>  (define_expand "storent<mode>"
>> +  [(set (match_operand:STORENT_MODE 0 "memory_operand" "")
>> +     (unspec:STORENT_MODE
>> +       [(match_operand:STORENT_MODE 1 "register_operand" "")]
>>         UNSPEC_MOVNT))]
>> +  "TARGET_SSE")
>
> ... I think you don't need one here.

Done.

>
>>  (define_expand "<code><mode>2"
>> +  [(set (match_operand:AVX256MODEF2P 0 "register_operand" "")
>> +     (absneg:AVX256MODEF2P
>> +       (match_operand:AVX256MODEF2P 1 "register_operand" "")))]
>> +  "AVX256_VEC_FLOAT_MODE_P (<MODE>mode)"
>> +  "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
> ...
>>  (define_expand "copysign<mode>3"
>>    [(set (match_dup 4)
>> +     (and:AVX256MODEF2P
>> +       (not:AVX256MODEF2P (match_dup 3))
>> +       (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "")))
>> +   (set (match_dup 5)
>> +     (and:AVX256MODEF2P (match_dup 3)
>> +                        (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))
>> +   (set (match_operand:AVX256MODEF2P 0 "register_operand" "")
>> +     (ior:AVX256MODEF2P (match_dup 4) (match_dup 5)))]
>> +  "AVX256_VEC_FLOAT_MODE_P (<MODE>mode)"
> ...
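
[Aside, for illustration only, not part of the patch: this is what the
copysign<mode>3 pattern quoted above computes, modeled element-wise in
plain C.  The helper name copysign_bits is made up for this sketch;
0x80000000 is the SFmode sign-bit mask that ix86_build_signbit_mask
constructs.]

/* dst = (op1 & ~signmask) | (op2 & signmask): the magnitude of op1
   with the sign of op2, matching the (and/not/ior) RTL above.  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float
copysign_bits (float op1, float op2)
{
  const uint32_t signmask = UINT32_C (0x80000000);  /* sign bit only */
  uint32_t u1, u2;

  memcpy (&u1, &op1, sizeof u1);
  memcpy (&u2, &op2, sizeof u2);
  u1 = (u1 & ~signmask) | (u2 & signmask);
  memcpy (&op1, &u1, sizeof op1);
  return op1;
}

int
main (void)
{
  /* Prints "-2.5 2.5".  */
  printf ("%g %g\n", copysign_bits (2.5f, -1.0f),
          copysign_bits (-2.5f, 1.0f));
  return 0;
}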
>>  (define_expand "vec_extract<mode>"
>> +  [(match_operand:<avxscalarmode> 0 "register_operand" "")
>> +   (match_operand:AVX256MODEF2P 1 "register_operand" "")
>> +   (match_operand 2 "const_int_operand" "")]
>> +  "TARGET_AVX"
>
> However my same comment about not duplicating expander patterns
> applies to these other instances as well.

Done.

>
>> +(define_insn "*vec_concat_lo__avx"
>
> You didn't understand my comment about avx_vpermilp_parallel at
> all, you merely doubled the number of patterns.  My point is
> that you would only need 1 (one) pattern.
>

Those patterns were added for the old 256bit vectorizer changes.  I am
not sure if they are needed now, so I deleted them.  I will revisit
them against the new 256bit vectorizer changes after those are checked
into trunk.  (For the reworked even/odd element extraction this patch
keeps, a standalone C model follows the patch below.)

OK for trunk?

Thanks.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ff2be62..0987b45 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -15752,17 +15752,28 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
   rtvec v;
 
   switch (mode)
     {
-    case SImode:
+    case V4SImode:
       gcc_assert (vect);
       v = gen_rtvec (4, value, value, value, value);
       return gen_rtx_CONST_VECTOR (V4SImode, v);
 
-    case DImode:
+    case V2DImode:
       gcc_assert (vect);
       v = gen_rtvec (2, value, value);
       return gen_rtx_CONST_VECTOR (V2DImode, v);
 
-    case SFmode:
+    case V8SFmode:
+      if (vect)
+        v = gen_rtvec (8, value, value, value, value,
+                       value, value, value, value);
+      else
+        v = gen_rtvec (8, value, CONST0_RTX (SFmode),
+                       CONST0_RTX (SFmode), CONST0_RTX (SFmode),
+                       CONST0_RTX (SFmode), CONST0_RTX (SFmode),
+                       CONST0_RTX (SFmode), CONST0_RTX (SFmode));
+      return gen_rtx_CONST_VECTOR (V8SFmode, v);
+
+    case V4SFmode:
       if (vect)
         v = gen_rtvec (4, value, value, value, value);
       else
@@ -15770,7 +15781,15 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
                        CONST0_RTX (SFmode), CONST0_RTX (SFmode));
       return gen_rtx_CONST_VECTOR (V4SFmode, v);
 
-    case DFmode:
+    case V4DFmode:
+      if (vect)
+        v = gen_rtvec (4, value, value, value, value);
+      else
+        v = gen_rtvec (4, value, CONST0_RTX (DFmode),
+                       CONST0_RTX (DFmode), CONST0_RTX (DFmode));
+      return gen_rtx_CONST_VECTOR (V4DFmode, v);
+
+    case V2DFmode:
       if (vect)
         v = gen_rtvec (2, value, value);
       else
@@ -15800,17 +15819,21 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
 
   /* Find the sign bit, sign extended to 2*HWI.  */
   switch (mode)
     {
-    case SImode:
-    case SFmode:
+    case V4SImode:
+    case V8SFmode:
+    case V4SFmode:
+      vec_mode = mode;
+      mode = GET_MODE_INNER (mode);
       imode = SImode;
-      vec_mode = (mode == SImode) ? V4SImode : V4SFmode;
       lo = 0x80000000, hi = lo < 0;
       break;
 
-    case DImode:
-    case DFmode:
+    case V2DImode:
+    case V4DFmode:
+    case V2DFmode:
+      vec_mode = mode;
+      mode = GET_MODE_INNER (mode);
       imode = DImode;
-      vec_mode = (mode == DImode) ? V2DImode : V2DFmode;
       if (HOST_BITS_PER_WIDE_INT >= 64)
         lo = (HOST_WIDE_INT)1 << shift, hi = -1;
       else
@@ -15864,7 +15887,7 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
   if (vec_mode == VOIDmode)
     return force_reg (mode, mask);
 
-  v = ix86_build_const_vector (mode, vect, mask);
+  v = ix86_build_const_vector (vec_mode, vect, mask);
   return force_reg (vec_mode, v);
 }
 
@@ -15877,22 +15900,25 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
   rtx mask, set, use, clob, dst, src;
   bool use_sse = false;
   bool vector_mode = VECTOR_MODE_P (mode);
-  enum machine_mode elt_mode = mode;
+  enum machine_mode vmode = mode;
 
   if (vector_mode)
-    {
-      elt_mode = GET_MODE_INNER (mode);
-      use_sse = true;
-    }
+    use_sse = true;
   else if (mode == TFmode)
     use_sse = true;
   else if (TARGET_SSE_MATH)
-    use_sse = SSE_FLOAT_MODE_P (mode);
+    {
+      use_sse = SSE_FLOAT_MODE_P (mode);
+      if (mode == SFmode)
+        vmode = V4SFmode;
+      else if (mode == DFmode)
+        vmode = V2DFmode;
+    }
 
   /* NEG and ABS performed with SSE use bitwise mask operations.
      Create the appropriate mask now.  */
   if (use_sse)
-    mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS);
+    mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
   else
     mask = NULL_RTX;
 
@@ -15926,7 +15952,7 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
 void
 ix86_expand_copysign (rtx operands[])
 {
-  enum machine_mode mode;
+  enum machine_mode mode, vmode;
   rtx dest, op0, op1, mask, nmask;
 
   dest = operands[0];
@@ -15935,6 +15961,13 @@ ix86_expand_copysign (rtx operands[])
 
   mode = GET_MODE (dest);
 
+  if (mode == SFmode)
+    vmode = V4SFmode;
+  else if (mode == DFmode)
+    vmode = V2DFmode;
+  else
+    vmode = mode;
+
   if (GET_CODE (op0) == CONST_DOUBLE)
     {
       rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
@@ -15944,15 +15977,11 @@ ix86_expand_copysign (rtx operands[])
 
       if (mode == SFmode || mode == DFmode)
         {
-          enum machine_mode vmode;
-
-          vmode = mode == SFmode ? V4SFmode : V2DFmode;
-
           if (op0 == CONST0_RTX (mode))
             op0 = CONST0_RTX (vmode);
           else
             {
-              rtx v = ix86_build_const_vector (mode, false, op0);
+              rtx v = ix86_build_const_vector (vmode, false, op0);
 
               op0 = force_reg (vmode, v);
             }
        }
       else if (op0 != CONST0_RTX (mode))
         op0 = force_reg (mode, op0);
 
-      mask = ix86_build_signbit_mask (mode, 0, 0);
+      mask = ix86_build_signbit_mask (vmode, 0, 0);
 
       if (mode == SFmode)
         copysign_insn = gen_copysignsf3_const;
@@ -15975,8 +16004,8 @@ ix86_expand_copysign (rtx operands[])
     {
       rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
 
-      nmask = ix86_build_signbit_mask (mode, 0, 1);
-      mask = ix86_build_signbit_mask (mode, 0, 0);
+      nmask = ix86_build_signbit_mask (vmode, 0, 1);
+      mask = ix86_build_signbit_mask (vmode, 0, 0);
 
       if (mode == SFmode)
         copysign_insn = gen_copysignsf3_var;
@@ -17877,8 +17906,7 @@ ix86_expand_int_vcond (rtx operands[])
 
          /* Subtract (-(INT MAX) - 1) from both operands to make
             them signed.  */
-         mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
-                                         true, false);
+         mask = ix86_build_signbit_mask (mode, true, false);
          gen_sub3 = (mode == V4SImode
                      ? gen_subv4si3 : gen_subv2di3);
          t1 = gen_reg_rtx (mode);
@@ -22713,6 +22741,8 @@ enum ix86_builtins
 
   /* Vectorizer support builtins.  */
   IX86_BUILTIN_CPYSGNPS,
   IX86_BUILTIN_CPYSGNPD,
+  IX86_BUILTIN_CPYSGNPS256,
+  IX86_BUILTIN_CPYSGNPD256,
 
   IX86_BUILTIN_CVTUDQ2PS,
@@ -23850,6 +23880,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
 
+  { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+
   { OPTION_MASK_ISA_ABM, CODE_FOR_clzhi2_abm, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
 
   /* F16C */
@@ -26036,15 +26069,23 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
   switch (fn)
     {
     case BUILT_IN_SQRT:
-      if (out_mode == DFmode && out_n == 2
-          && in_mode == DFmode && in_n == 2)
-        return ix86_builtins[IX86_BUILTIN_SQRTPD];
+      if (out_mode == DFmode && in_mode == DFmode)
+        {
+          if (out_n == 2 && in_n == 2)
+            return ix86_builtins[IX86_BUILTIN_SQRTPD];
+          else if (out_n == 4 && in_n == 4)
+            return ix86_builtins[IX86_BUILTIN_SQRTPD256];
+        }
       break;
 
     case BUILT_IN_SQRTF:
-      if (out_mode == SFmode && out_n == 4
-          && in_mode == SFmode && in_n == 4)
-        return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
+      if (out_mode == SFmode && in_mode == SFmode)
+        {
+          if (out_n == 4 && in_n == 4)
+            return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
+          else if (out_n == 8 && in_n == 8)
+            return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
+        }
       break;
 
     case BUILT_IN_LRINT:
@@ -26054,21 +26095,33 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
       break;
 
     case BUILT_IN_LRINTF:
-      if (out_mode == SImode && out_n == 4
-          && in_mode == SFmode && in_n == 4)
-        return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
+      if (out_mode == SImode && in_mode == SFmode)
+        {
+          if (out_n == 4 && in_n == 4)
+            return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
+          else if (out_n == 8 && in_n == 8)
+            return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
+        }
       break;
 
     case BUILT_IN_COPYSIGN:
-      if (out_mode == DFmode && out_n == 2
-          && in_mode == DFmode && in_n == 2)
-        return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
+      if (out_mode == DFmode && in_mode == DFmode)
+        {
+          if (out_n == 2 && in_n == 2)
+            return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
+          else if (out_n == 4 && in_n == 4)
+            return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
+        }
       break;
 
     case BUILT_IN_COPYSIGNF:
-      if (out_mode == SFmode && out_n == 4
-          && in_mode == SFmode && in_n == 4)
-        return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
+      if (out_mode == SFmode && in_mode == SFmode)
+        {
+          if (out_n == 4 && in_n == 4)
+            return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
+          else if (out_n == 8 && in_n == 8)
+            return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
+        }
      break;
 
     default:
@@ -26391,6 +26444,9 @@ ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
     case IX86_BUILTIN_SQRTPS_NR:
       return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
 
+    case IX86_BUILTIN_SQRTPS_NR256:
+      return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
+
     default:
       return NULL_TREE;
     }
@@ -30053,7 +30109,7 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
   two = CONST_DOUBLE_FROM_REAL_VALUE (dconst2, SFmode);
 
   if (VECTOR_MODE_P (mode))
-    two = ix86_build_const_vector (SFmode, true, two);
+    two = ix86_build_const_vector (mode, true, two);
 
   two = force_reg (mode, two);
@@ -30100,8 +30156,8 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
 
   if (VECTOR_MODE_P (mode))
     {
-      mthree = ix86_build_const_vector (SFmode, true, mthree);
-      mhalf = ix86_build_const_vector (SFmode, true, mhalf);
+      mthree = ix86_build_const_vector (mode, true, mthree);
+      mhalf = ix86_build_const_vector (mode, true, mhalf);
     }
 
   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
@@ -30246,7 +30302,16 @@ ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
   rtx sgn = gen_reg_rtx (mode);
   if (mask == NULL_RTX)
     {
-      mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), false);
+      enum machine_mode vmode;
+
+      if (mode == SFmode)
+        vmode = V4SFmode;
+      else if (mode == DFmode)
+        vmode = V2DFmode;
+      else
+        vmode = mode;
+
+      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
       if (!VECTOR_MODE_P (mode))
        {
          /* We need to generate a scalar mode mask in this case.  */
@@ -30270,11 +30335,17 @@ ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
 static rtx
 ix86_expand_sse_fabs (rtx op0, rtx *smask)
 {
-  enum machine_mode mode = GET_MODE (op0);
+  enum machine_mode vmode, mode = GET_MODE (op0);
   rtx xa, mask;
 
   xa = gen_reg_rtx (mode);
-  mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), true);
+  if (mode == SFmode)
+    vmode = V4SFmode;
+  else if (mode == DFmode)
+    vmode = V2DFmode;
+  else
+    vmode = mode;
+  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
   if (!VECTOR_MODE_P (mode))
     {
       /* We need to generate a scalar mode mask in this case.  */
@@ -31617,7 +31688,7 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
 static bool
 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 {
-  rtx t1, t2, t3, t4;
+  rtx t1, t2, t3;
 
   switch (d->vmode)
     {
@@ -31639,34 +31710,34 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 
     case V8SFmode:
       {
-       static const unsigned char perm1[8] = { 0, 2, 1, 3, 5, 6, 5, 7 };
-       static const unsigned char perme[8] = { 0, 1, 8, 9, 4, 5, 12, 13 };
-       static const unsigned char permo[8] = { 2, 3, 10, 11, 6, 7, 14, 15 };
+       int mask = odd ? 0xdd : 0x88;
 
        t1 = gen_reg_rtx (V8SFmode);
        t2 = gen_reg_rtx (V8SFmode);
        t3 = gen_reg_rtx (V8SFmode);
-       t4 = gen_reg_rtx (V8SFmode);
 
        /* Shuffle within the 128-bit lanes to produce:
-          { 0 2 1 3 4 6 5 7 } and { 8 a 9 b c e d f }.  */
-       expand_vselect (t1, d->op0, perm1, 8);
-       expand_vselect (t2, d->op1, perm1, 8);
+          { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
+       emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
+                                     GEN_INT (mask)));
+
+       /* Shuffle the lanes around to produce:
+          { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
+       emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
+                                           GEN_INT (0x3)));
+
+       /* Shuffle within the 128-bit lanes to produce:
+          { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
+       emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
+
+       /* Shuffle within the 128-bit lanes to produce:
+          { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
+       emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
 
        /* Shuffle the lanes around to produce:
-          { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
-       emit_insn (gen_avx_vperm2f128v8sf3 (t3, t1, t2, GEN_INT (0x20)));
-       emit_insn (gen_avx_vperm2f128v8sf3 (t4, t1, t2, GEN_INT (0x31)));
-
-       /* Now a vpermil2p will produce the result required.  */
-       /* ??? The vpermil2p requires a vector constant.  Another option
-          is a unpck[lh]ps to merge the two vectors to produce
-          { 0 4 2 6 8 c a e } or { 1 5 3 7 9 d b f }.  Then use another
Then use another - vpermilps to get the elements into the final order. */ - d->op0 = t3; - d->op1 = t4; - memcpy (d->perm, odd ? permo: perme, 8); - expand_vec_perm_vpermil (d); + { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ + emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, + GEN_INT (0x20))); } break; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index e11908c..d2ad8b1 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4509,7 +4509,7 @@ real_ldexp (&TWO31r, &dconst1, 31); two31 = const_double_from_real_value (TWO31r, mode); - two31 = ix86_build_const_vector (mode, true, two31); + two31 = ix86_build_const_vector (vecmode, true, two31); operands[2] = force_reg (vecmode, two31); }) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1784da9..db5e4de 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -86,6 +86,25 @@ (V8HI "TARGET_SSE2") (V16QI "TARGET_SSE2") (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) +;; Modes handled by storent patterns. +(define_mode_iterator STORENT_MODE + [(SF "TARGET_SSE4A") (DF "TARGET_SSE4A") + (SI "TARGET_SSE2") (V2DI "TARGET_SSE2") (V2DF "TARGET_SSE2") + (V4SF "TARGET_SSE") + (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) + +;; Modes handled by vector float patterns. +(define_mode_iterator VEC_FLOAT_MODE + [(V2DF "TARGET_SSE2") (V4SF "TARGET_SSE") + (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) + +;; Modes handled by vector extract patterns. +(define_mode_iterator VEC_EXTRACT_MODE + [(V2DI "TARGET_SSE") (V4SI "TARGET_SSE") + (V8HI "TARGET_SSE") (V16QI "TARGET_SSE") + (V2DF "TARGET_SSE") (V4SF "TARGET_SSE") + (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) + ;; Mapping from float mode to required SSE level (define_mode_attr sse [(SF "sse") (DF "sse2") (V4SF "sse") (V2DF "sse2")]) @@ -504,30 +523,10 @@ ; define patterns for other modes that would expand to several insns. 
(define_expand "storent" - [(set (match_operand:SSEMODEF2P 0 "memory_operand" "") - (unspec:SSEMODEF2P - [(match_operand:SSEMODEF2P 1 "register_operand" "")] - UNSPEC_MOVNT))] - "SSE_VEC_FLOAT_MODE_P (mode)") - -(define_expand "storent" - [(set (match_operand:MODEF 0 "memory_operand" "") - (unspec:MODEF - [(match_operand:MODEF 1 "register_operand" "")] - UNSPEC_MOVNT))] - "TARGET_SSE4A") - -(define_expand "storentv2di" - [(set (match_operand:V2DI 0 "memory_operand" "") - (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "")] - UNSPEC_MOVNT))] - "TARGET_SSE2") - -(define_expand "storentsi" - [(set (match_operand:SI 0 "memory_operand" "") - (unspec:SI [(match_operand:SI 1 "register_operand" "")] - UNSPEC_MOVNT))] - "TARGET_SSE2") + [(set (match_operand:STORENT_MODE 0 "memory_operand" "") + (unspec:STORENT_MODE + [(match_operand:STORENT_MODE 1 "register_operand" "")] + UNSPEC_MOVNT))]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -536,10 +535,10 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_expand "2" - [(set (match_operand:SSEMODEF2P 0 "register_operand" "") - (absneg:SSEMODEF2P - (match_operand:SSEMODEF2P 1 "register_operand" "")))] - "SSE_VEC_FLOAT_MODE_P (mode)" + [(set (match_operand:VEC_FLOAT_MODE 0 "register_operand" "") + (absneg:VEC_FLOAT_MODE + (match_operand:VEC_FLOAT_MODE 1 "register_operand" "")))] + "" "ix86_expand_fp_absneg_operator (, mode, operands); DONE;") (define_expand "3" @@ -1380,6 +1379,19 @@ [(set_attr "type" "sseadd") (set_attr "mode" "V2DF")]) +(define_expand "reduc_splus_v8sf" + [(match_operand:V8SF 0 "register_operand" "") + (match_operand:V8SF 1 "register_operand" "")] + "TARGET_AVX" +{ + rtx tmp = gen_reg_rtx (V8SFmode); + rtx tmp2 = gen_reg_rtx (V8SFmode); + emit_insn (gen_avx_haddv8sf3 (tmp, operands[1], operands[1])); + emit_insn (gen_avx_haddv8sf3 (tmp2, operands[1], operands[1])); + emit_insn (gen_avx_haddv8sf3 (operands[0], tmp2, tmp2)); + DONE; +}) + (define_expand "reduc_splus_v4sf" [(match_operand:V4SF 0 "register_operand" "") (match_operand:V4SF 1 "register_operand" "")] @@ -1396,6 +1408,17 @@ DONE; }) +(define_expand "reduc_splus_v4df" + [(match_operand:V4DF 0 "register_operand" "") + (match_operand:V4DF 1 "register_operand" "")] + "TARGET_AVX" +{ + rtx tmp = gen_reg_rtx (V4DFmode); + emit_insn (gen_avx_haddv4df3 (tmp, operands[1], operands[1])); + emit_insn (gen_avx_haddv4df3 (operands[0], tmp, tmp)); + DONE; +}) + (define_expand "reduc_splus_v2df" [(match_operand:V2DF 0 "register_operand" "") (match_operand:V2DF 1 "register_operand" "")] @@ -1650,17 +1673,17 @@ (define_expand "copysign3" [(set (match_dup 4) - (and:SSEMODEF2P - (not:SSEMODEF2P (match_dup 3)) - (match_operand:SSEMODEF2P 1 "nonimmediate_operand" ""))) + (and:VEC_FLOAT_MODE + (not:VEC_FLOAT_MODE (match_dup 3)) + (match_operand:VEC_FLOAT_MODE 1 "nonimmediate_operand" ""))) (set (match_dup 5) - (and:SSEMODEF2P (match_dup 3) - (match_operand:SSEMODEF2P 2 "nonimmediate_operand" ""))) - (set (match_operand:SSEMODEF2P 0 "register_operand" "") - (ior:SSEMODEF2P (match_dup 4) (match_dup 5)))] - "SSE_VEC_FLOAT_MODE_P (mode)" + (and:VEC_FLOAT_MODE (match_dup 3) + (match_operand:VEC_FLOAT_MODE 2 "nonimmediate_operand" ""))) + (set (match_operand:VEC_FLOAT_MODE 0 "register_operand" "") + (ior:VEC_FLOAT_MODE (match_dup 4) (match_dup 5)))] + "" { - operands[3] = ix86_build_signbit_mask (mode, 1, 0); + operands[3] = ix86_build_signbit_mask (mode, 1, 0); operands[4] = gen_reg_rtx (mode); operands[5] = gen_reg_rtx (mode); @@ -2657,7 
   x = const_double_from_real_value (TWO32r, SFmode);
 
   operands[3] = force_reg (V4SFmode, CONST0_RTX (V4SFmode));
-  operands[4] = force_reg (V4SFmode, ix86_build_const_vector (SFmode, 1, x));
+  operands[4] = force_reg (V4SFmode,
+                          ix86_build_const_vector (V4SFmode, 1, x));
 
   for (i = 5; i < 8; i++)
     operands[i] = gen_reg_rtx (V4SFmode);
@@ -2892,6 +2916,18 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_insn "*avx_cvtdq2pd256_2"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+       (float:V4DF
+         (vec_select:V4SI
+           (match_operand:V8SI 1 "nonimmediate_operand" "xm")
+           (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+  "vcvtdq2pd\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
 (define_insn "sse2_cvtdq2pd"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
        (float:V2DF
@@ -3072,6 +3108,18 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_insn "*avx_cvtps2pd256_2"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+       (float_extend:V4DF
+         (vec_select:V4SF
+           (match_operand:V8SF 1 "nonimmediate_operand" "xm")
+           (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+  "vcvtps2pd\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
 (define_insn "sse2_cvtps2pd"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
        (float_extend:V2DF
@@ -3104,6 +3152,22 @@
   "TARGET_SSE2"
   "operands[2] = gen_reg_rtx (V4SFmode);")
 
+(define_expand "vec_unpacks_hi_v8sf"
+  [(set (match_dup 2)
+       (vec_select:V4SF
+         (match_operand:V8SF 1 "nonimmediate_operand" "")
+         (parallel [(const_int 4)
+                    (const_int 5)
+                    (const_int 6)
+                    (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand" "")
+       (float_extend:V4DF
+         (match_dup 2)))]
+  "TARGET_AVX"
+{
+  operands[2] = gen_reg_rtx (V4SFmode);
+})
+
 (define_expand "vec_unpacks_lo_v4sf"
   [(set (match_operand:V2DF 0 "register_operand" "")
       (float_extend:V2DF
@@ -3112,6 +3176,14 @@
          (parallel [(const_int 0) (const_int 1)]))))]
   "TARGET_SSE2")
 
+(define_expand "vec_unpacks_lo_v8sf"
+  [(set (match_operand:V4DF 0 "register_operand" "")
+       (float_extend:V4DF
+         (vec_select:V4SF
+           (match_operand:V8SF 1 "nonimmediate_operand" "")
+           (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX")
+
 (define_expand "vec_unpacks_float_hi_v8hi"
   [(match_operand:V4SF 0 "register_operand" "")
    (match_operand:V8HI 1 "register_operand" "")]
@@ -3184,6 +3256,28 @@
          (parallel [(const_int 0) (const_int 1)]))))]
   "TARGET_SSE2")
 
+(define_expand "vec_unpacks_float_hi_v8si"
+  [(set (match_dup 2)
+       (vec_select:V4SI
+         (match_operand:V8SI 1 "nonimmediate_operand" "")
+         (parallel [(const_int 4)
+                    (const_int 5)
+                    (const_int 6)
+                    (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand" "")
+       (float:V4DF
+         (match_dup 2)))]
+  "TARGET_AVX"
+  "operands[2] = gen_reg_rtx (V4SImode);")
+
+(define_expand "vec_unpacks_float_lo_v8si"
+  [(set (match_operand:V4DF 0 "register_operand" "")
+       (float:V4DF
+         (vec_select:V4SI
+           (match_operand:V8SI 1 "nonimmediate_operand" "")
+           (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX")
+
 (define_expand "vec_unpacku_float_hi_v4si"
   [(set (match_dup 5)
       (vec_select:V4SI
@@ -3213,7 +3307,8 @@
 
   x = const_double_from_real_value (TWO32r, DFmode);
   operands[3] = force_reg (V2DFmode, CONST0_RTX (V2DFmode));
-  operands[4] = force_reg (V2DFmode, ix86_build_const_vector (DFmode, 1, x));
+  operands[4] = force_reg (V2DFmode,
+                          ix86_build_const_vector (V2DFmode, 1, x));
 
   operands[5] = gen_reg_rtx (V4SImode);
@@ -3243,12 +3338,30 @@
 
   x = const_double_from_real_value (TWO32r, DFmode);
   operands[3] = force_reg (V2DFmode, CONST0_RTX (V2DFmode));
-  operands[4] = force_reg (V2DFmode, ix86_build_const_vector (DFmode, 1, x));
+  operands[4] = force_reg (V2DFmode,
+                          ix86_build_const_vector (V2DFmode, 1, x));
 
   for (i = 5; i < 8; i++)
     operands[i] = gen_reg_rtx (V2DFmode);
 })
 
+(define_expand "vec_pack_trunc_v4df"
+  [(set (match_dup 3)
+       (float_truncate:V4SF
+         (match_operand:V4DF 1 "nonimmediate_operand" "")))
+   (set (match_dup 4)
+       (float_truncate:V4SF
+         (match_operand:V4DF 2 "nonimmediate_operand" "")))
+   (set (match_operand:V8SF 0 "register_operand" "")
+       (vec_concat:V8SF
+         (match_dup 3)
+         (match_dup 4)))]
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V4SFmode);
+  operands[4] = gen_reg_rtx (V4SFmode);
+})
+
 (define_expand "vec_pack_trunc_v2df"
   [(match_operand:V4SF 0 "register_operand" "")
    (match_operand:V2DF 1 "nonimmediate_operand" "")
@@ -3441,6 +3554,41 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4SF")])
 
+(define_expand "vec_interleave_highv8sf"
+  [(set (match_dup 3)
+       (vec_select:V8SF
+         (vec_concat:V16SF
+           (match_operand:V8SF 1 "register_operand" "x")
+           (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 0) (const_int 8)
+                    (const_int 1) (const_int 9)
+                    (const_int 4) (const_int 12)
+                    (const_int 5) (const_int 13)])))
+   (set (match_dup 4)
+       (vec_select:V8SF
+         (vec_concat:V16SF
+           (match_dup 1)
+           (match_dup 2))
+         (parallel [(const_int 2) (const_int 10)
+                    (const_int 3) (const_int 11)
+                    (const_int 6) (const_int 14)
+                    (const_int 7) (const_int 15)])))
+   (set (match_operand:V8SF 0 "register_operand" "")
+       (vec_concat:V8SF
+         (vec_select:V4SF
+           (match_dup 3)
+           (parallel [(const_int 4) (const_int 5)
+                      (const_int 6) (const_int 7)]))
+         (vec_select:V4SF
+           (match_dup 4)
+           (parallel [(const_int 4) (const_int 5)
+                      (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V8SFmode);
+  operands[4] = gen_reg_rtx (V8SFmode);
+})
+
 (define_insn "vec_interleave_highv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
       (vec_select:V4SF
@@ -3485,6 +3633,41 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4SF")])
 
+(define_expand "vec_interleave_lowv8sf"
+  [(set (match_dup 3)
+       (vec_select:V8SF
+         (vec_concat:V16SF
+           (match_operand:V8SF 1 "register_operand" "x")
+           (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 0) (const_int 8)
+                    (const_int 1) (const_int 9)
+                    (const_int 4) (const_int 12)
+                    (const_int 5) (const_int 13)])))
+   (set (match_dup 4)
+       (vec_select:V8SF
+         (vec_concat:V16SF
+           (match_dup 1)
+           (match_dup 2))
+         (parallel [(const_int 2) (const_int 10)
+                    (const_int 3) (const_int 11)
+                    (const_int 6) (const_int 14)
+                    (const_int 7) (const_int 15)])))
+   (set (match_operand:V8SF 0 "register_operand" "")
+       (vec_concat:V8SF
+         (vec_select:V4SF
+           (match_dup 3)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)]))
+         (vec_select:V4SF
+           (match_dup 4)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V8SFmode);
+  operands[4] = gen_reg_rtx (V8SFmode);
+})
+
 (define_insn "vec_interleave_lowv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
       (vec_select:V4SF
@@ -4353,8 +4536,8 @@
 })
 
 (define_expand "vec_extract<mode>"
-  [(match_operand:<ssescalarmode> 0 "register_operand" "")
-   (match_operand:SSEMODE 1 "register_operand" "")
+  [(match_operand:<avxscalarmode> 0 "register_operand" "")
+   (match_operand:VEC_EXTRACT_MODE 1 "register_operand" "")
    (match_operand 2 "const_int_operand" "")]
   "TARGET_SSE"
 {
@@ -4384,6 +4567,36 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_expand "vec_interleave_highv4df"
+  [(set (match_dup 3)
+       (vec_select:V4DF
+         (vec_concat:V8DF
+           (match_operand:V4DF 1 "register_operand" "x")
+           (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 0) (const_int 4)
+                    (const_int 2) (const_int 6)])))
+   (set (match_dup 4)
+       (vec_select:V4DF
+         (vec_concat:V8DF
+           (match_dup 1)
+           (match_dup 2))
+         (parallel [(const_int 1) (const_int 5)
+                    (const_int 3) (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand" "")
+       (vec_concat:V4DF
+         (vec_select:V2DF
+           (match_dup 3)
+           (parallel [(const_int 2) (const_int 3)]))
+         (vec_select:V2DF
+           (match_dup 4)
+           (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V4DFmode);
+  operands[4] = gen_reg_rtx (V4DFmode);
+})
+
+
 (define_expand "vec_interleave_highv2df"
   [(set (match_operand:V2DF 0 "register_operand" "")
       (vec_select:V2DF
@@ -4489,6 +4702,35 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_expand "vec_interleave_lowv4df"
+  [(set (match_dup 3)
+       (vec_select:V4DF
+         (vec_concat:V8DF
+           (match_operand:V4DF 1 "register_operand" "x")
+           (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 0) (const_int 4)
+                    (const_int 2) (const_int 6)])))
+   (set (match_dup 4)
+       (vec_select:V4DF
+         (vec_concat:V8DF
+           (match_dup 1)
+           (match_dup 2))
+         (parallel [(const_int 1) (const_int 5)
+                    (const_int 3) (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand" "")
+       (vec_concat:V4DF
+         (vec_select:V2DF
+           (match_dup 3)
+           (parallel [(const_int 0) (const_int 1)]))
+         (vec_select:V2DF
+           (match_dup 4)
+           (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V4DFmode);
+  operands[4] = gen_reg_rtx (V4DFmode);
+})
+
 (define_expand "vec_interleave_lowv2df"
   [(set (match_operand:V2DF 0 "register_operand" "")
       (vec_select:V2DF
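
[Aside, for illustration only, not part of the patch: a standalone C
model of the new five-instruction V8SF even/odd extraction sequence in
expand_vec_perm_even_odd_1 above.  The helper names shufps256 and
vperm2f128 are made up; they mimic the per-128-bit-lane selection of
vshufps and the lane selection of vperm2f128.  Compiled and run, it
prints the even elements: 0 2 4 6 8 10 12 14.]

#include <stdio.h>

typedef struct { float e[8]; } v8sf;

/* Per-lane SHUFPS: in each 128-bit lane, result elements 0-1 come
   from A and elements 2-3 from B, picked by 2-bit fields of IMM.  */
static v8sf
shufps256 (v8sf a, v8sf b, int imm)
{
  v8sf r;
  for (int lane = 0; lane < 8; lane += 4)
    {
      r.e[lane + 0] = a.e[lane + ((imm >> 0) & 3)];
      r.e[lane + 1] = a.e[lane + ((imm >> 2) & 3)];
      r.e[lane + 2] = b.e[lane + ((imm >> 4) & 3)];
      r.e[lane + 3] = b.e[lane + ((imm >> 6) & 3)];
    }
  return r;
}

/* VPERM2F128: IMM bits 0-1 pick the low result lane and bits 4-5 the
   high one, out of { a.lo, a.hi, b.lo, b.hi }.  */
static v8sf
vperm2f128 (v8sf a, v8sf b, int imm)
{
  const float *lanes[4] = { a.e, a.e + 4, b.e, b.e + 4 };
  v8sf r;
  for (int i = 0; i < 4; i++)
    {
      r.e[i] = lanes[imm & 3][i];
      r.e[i + 4] = lanes[(imm >> 4) & 3][i];
    }
  return r;
}

int
main (void)
{
  v8sf op0, op1;
  for (int i = 0; i < 8; i++)
    {
      op0.e[i] = i;		/* elements 0..7 of the comments  */
      op1.e[i] = 8 + i;		/* elements 8..f of the comments  */
    }

  int odd = 0;
  int mask = odd ? 0xdd : 0x88;

  v8sf t1 = shufps256 (op0, op1, mask);	/* 0 2 8 a 4 6 c e */
  v8sf t2 = vperm2f128 (t1, t1, 0x3);	/* 4 6 c e 0 2 8 a */
  v8sf t3 = shufps256 (t1, t2, 0x44);	/* 0 2 4 6 4 6 0 2 */
  t2 = shufps256 (t1, t2, 0xee);	/* 8 a c e c e 8 a */
  v8sf res = vperm2f128 (t3, t2, 0x20);	/* 0 2 4 6 8 a c e */

  for (int i = 0; i < 8; i++)
    printf ("%g ", res.e[i]);
  printf ("\n");
  return 0;
}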