[2/3] i386: Implement widen_smul_*_v4si for plain sse2

Message ID	1340767258-21086-2-git-send-email-rth@redhat.com
State	New
Headers	show Return-Path: <gcc-patches-return-321733-incoming=patchwork.ozlabs.org@gcc.gnu.org> Comment: DKIM? See http://www.dkim.org Comment: DomainKeys? See http://antispam.yahoo.com/domainkeys DomainKey-Signature: a=rsa-sha1; q=dns; c=nofws; s=default; d=gcc.gnu.org; h=Received:Received:X-SWARE-Spam-Status:X-Spam-Check-By:Received:Received:Received:Received:From:To:Subject:Date:Message-Id:In-Reply-To:References:X-IsSubscribed:Mailing-List:Precedence:List-Id:List-Unsubscribe:List-Archive:List-Post:List-Help:Sender:Delivered-To; b=oQHYhdyF4D8qDlzCkUAhECoqy0VhGUE2O7kG6BGtcneLz2HnfbYNSG4tnU3gg3 jqQ7KGPTKPG6V/UH1fbRMSsReALf5HHBgDIlxSXRFpnnKk8Yli/nUGY0fzj0Z4/F CKPd0cWzogyzIOxYLdFJQhGH4757LwBhgBvvQxnORQaz0=; From: Richard Henderson <rth@redhat.com> To: gcc-patches@gcc.gnu.org Subject: [PATCH 2/3] i386: Implement widen_smul_*_v4si for plain sse2 Date: Tue, 26 Jun 2012 20:20:57 -0700 Message-Id: <1340767258-21086-2-git-send-email-rth@redhat.com> In-Reply-To: <1340767258-21086-1-git-send-email-rth@redhat.com> References: <1340767258-21086-1-git-send-email-rth@redhat.com> Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5cf230f..b96fc6e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -25758,6 +25758,7 @@ enum ix86_builtins IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI, IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI, + IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI, IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI, @@ -26620,7 +26621,9 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_vw_umul_even_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_smult_even_v4si, "__builtin_ia32_vw_smul_even_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_odd_v4si, "__builtin_ia32_vw_umul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI }, @@ -26747,7 +26750,6 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, /* SSE4.1 */ @@ -31067,18 +31069,10 @@ ix86_builtin_mul_widen_even (tree type) switch (TYPE_MODE (type)) { case V4SImode: - if (uns_p) - { - if (!TARGET_SSE2) - return NULL; - code = IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI; - } - else - { - if (!TARGET_SSE4_1) - return NULL; - code = IX86_BUILTIN_PMULDQ128; - } + if (!TARGET_SSE2) + return NULL; + code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI + : IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI); break; case V8SImode: @@ -31103,18 +31097,10 @@ ix86_builtin_mul_widen_odd (tree type) switch (TYPE_MODE (type)) { case V4SImode: - if (uns_p) - { - if (!TARGET_SSE2) - return NULL; - code = IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI; - } - else - { - if (!TARGET_SSE4_1) - return NULL; - code = IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI; - } + if (!TARGET_SSE2) + return NULL; + code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI + : IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI); break; case V8SImode: @@ -38774,12 +38760,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); return; } + + x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), - GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL, - 1, OPTAB_DIRECT); + x, NULL, 1, OPTAB_DIRECT); op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), - GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL, - 1, OPTAB_DIRECT); + x, NULL, 1, OPTAB_DIRECT); op1 = gen_lowpart (mode, op1); op2 = gen_lowpart (mode, op2); } @@ -38801,7 +38787,38 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, x = gen_xop_pmacsdql (dest, op1, op2, x); } else - gcc_unreachable (); + { + rtx s1, s2, t0, t1, t2; + + /* The easiest way to implement this without PMULDQ is to go through + the motions as if we are performing a full 64-bit multiply. With + the exception that we need to do less shuffling of the elements. */ + + /* Compute the sign-extension, aka highparts, of the two operands. */ + s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), + op1, pc_rtx, pc_rtx); + s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), + op2, pc_rtx, pc_rtx); + + /* Multiply LO(A) * HI(B), and vice-versa. */ + t1 = gen_reg_rtx (wmode); + t2 = gen_reg_rtx (wmode); + emit_insn (gen_sse2_umulv2siv2di3 (t1, s1, op2)); + emit_insn (gen_sse2_umulv2siv2di3 (t2, s2, op1)); + + /* Multiply LO(A) * LO(B). */ + t0 = gen_reg_rtx (wmode); + emit_insn (gen_sse2_umulv2siv2di3 (t0, op1, op2)); + + /* Combine and shift the highparts into place. */ + t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT); + t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1, + 1, OPTAB_DIRECT); + + /* Combine high and low parts. */ + force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT); + return; + } emit_insn (x); } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 81e7dc0..754b8b4 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -5607,9 +5607,7 @@ (any_extend:<sseunpackmode> (match_operand:VI124_AVX2 1 "register_operand")) (match_operand:VI124_AVX2 2 "register_operand")] - ; Note that SSE2 does not have signed SI multiply - "TARGET_XOP || TARGET_SSE4_1 - || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))" + "TARGET_SSE2" { ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2], <u_bool>, true); @@ -5621,23 +5619,32 @@ (any_extend:<sseunpackmode> (match_operand:VI124_AVX2 1 "register_operand")) (match_operand:VI124_AVX2 2 "register_operand")] - ; Note that SSE2 does not have signed SI multiply - "TARGET_XOP || TARGET_SSE4_1 - || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))" + "TARGET_SSE2" { ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2], <u_bool>, false); DONE; }) +;; Most widen_<s>mult_even_<mode> can be handled directly from other +;; named patterns, but signed V4SI needs special help for plain SSE2. +(define_expand "vec_widen_smult_even_v4si" + [(match_operand:V2DI 0 "register_operand") + (match_operand:V4SI 1 "register_operand") + (match_operand:V4SI 2 "register_operand")] + "TARGET_SSE2" +{ + ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2], + false, false); + DONE; +}) + (define_expand "vec_widen_<s>mult_odd_<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") (any_extend:<sseunpackmode> - (match_operand:VI124_AVX2 1 "register_operand")) - (match_operand:VI124_AVX2 2 "register_operand")] - ; Note that SSE2 does not have signed SI multiply - "TARGET_AVX || TARGET_XOP || TARGET_SSE4_1 - || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))" + (match_operand:VI4_AVX2 1 "register_operand")) + (match_operand:VI4_AVX2 2 "register_operand")] + "TARGET_SSE2" { ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2], <u_bool>, true);

[2/3] i386: Implement widen_smul_*_v4si for plain sse2

Commit Message

Patch