[PATCH], RFC Add support for C99 fma/fmaf/fmal builtins

Message ID	20101004201919.GA22525@hungry-tiger.westford.ibm.com
State	New
Headers	show Return-Path: <gcc-patches-return-274510-incoming=patchwork.ozlabs.org@gcc.gnu.org> Date: Mon, 4 Oct 2010 16:19:19 -0400 From: Michael Meissner <meissner@linux.vnet.ibm.com> To: gcc-patches@gcc.gnu.org, dje.gcc@gmail.com Subject: [PATCH], RFC Add support for C99 fma/fmas/fmal builtins Message-ID: <20101004201919.GA22525@hungry-tiger.westford.ibm.com> Mail-Followup-To: Michael Meissner <meissner@linux.vnet.ibm.com>, gcc-patches@gcc.gnu.org, dje.gcc@gmail.com MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="X1bOJ3K7DJ5YkBrT" Content-Disposition: inline User-Agent: Mutt/1.5.21 (2010-09-15) Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org

Index: gcc/doc/md.texi =================================================================== --- gcc/doc/md.texi (revision 164946) +++ gcc/doc/md.texi (working copy) @@ -3948,6 +3948,16 @@ means of constraints requiring operands @itemx @samp{and@var{m}3}, @samp{ior@var{m}3}, @samp{xor@var{m}3} Similar, for other arithmetic operations. +@cindex @code{fma@var{m}4} instruction pattern +@item @samp{fma@var{m}4} +Multiply operand 2 and operand 1, then add operand 3, storing the +result in operand 0. All operands must have mode @var{m}. This +pattern is used to implement the @code{fma}, @code{fmaf}, and +@code{fmal} builtin functions from the ISO C99 standard. The +@code{fma} operation may produce different results than doing the +multiply followed by the add if the machine does not perform a +rounding step between the operations. + @cindex @code{min@var{m}3} instruction pattern @cindex @code{max@var{m}3} instruction pattern @item @samp{smin@var{m}3}, @samp{smax@var{m}3} Index: gcc/optabs.h =================================================================== --- gcc/optabs.h (revision 164946) +++ gcc/optabs.h (working copy) @@ -190,6 +190,8 @@ enum optab_index OTI_pow, /* Arc tangent of y/x */ OTI_atan2, + /* Floating multiply/add */ + OTI_fma, /* Move instruction. 
*/ OTI_mov, @@ -432,6 +434,7 @@ enum optab_index #define umax_optab (&optab_table[OTI_umax]) #define pow_optab (&optab_table[OTI_pow]) #define atan2_optab (&optab_table[OTI_atan2]) +#define fma_optab (&optab_table[OTI_fma]) #define mov_optab (&optab_table[OTI_mov]) #define movstrict_optab (&optab_table[OTI_movstrict]) Index: gcc/genopinit.c =================================================================== --- gcc/genopinit.c (revision 164946) +++ gcc/genopinit.c (working copy) @@ -159,6 +159,7 @@ static const char * const optabs[] = "set_optab_handler (sqrt_optab, $A, CODE_FOR_$(sqrt$a2$))", "set_optab_handler (floor_optab, $A, CODE_FOR_$(floor$a2$))", "set_convert_optab_handler (lfloor_optab, $B, $A, CODE_FOR_$(lfloor$F$a$I$b2$))", + "set_optab_handler (fma_optab, $A, CODE_FOR_$(fma$a4$))", "set_optab_handler (ceil_optab, $A, CODE_FOR_$(ceil$a2$))", "set_convert_optab_handler (lceil_optab, $B, $A, CODE_FOR_$(lceil$F$a$I$b2$))", "set_optab_handler (round_optab, $A, CODE_FOR_$(round$a2$))", Index: gcc/builtins.c =================================================================== --- gcc/builtins.c (revision 164946) +++ gcc/builtins.c (working copy) @@ -106,6 +106,7 @@ static void expand_errno_check (tree, rt static rtx expand_builtin_mathfn (tree, rtx, rtx); static rtx expand_builtin_mathfn_2 (tree, rtx, rtx); static rtx expand_builtin_mathfn_3 (tree, rtx, rtx); +static rtx expand_builtin_mathfn_ternary (tree, rtx, rtx); static rtx expand_builtin_interclass_mathfn (tree, rtx); static rtx expand_builtin_sincos (tree); static rtx expand_builtin_cexpi (tree, rtx); @@ -2185,6 +2186,79 @@ expand_builtin_mathfn_2 (tree exp, rtx t return target; } +/* Expand a call to the builtin trinary math functions (fma). + Return NULL_RTX if a normal call should be emitted rather than expanding the + function in-line. EXP is the expression that is a call to the builtin + function; if convenient, the result should be placed in TARGET. 
+ SUBTARGET may be used as the target for computing one of EXP's + operands. */ + +static rtx +expand_builtin_mathfn_ternary (tree exp, rtx target, rtx subtarget) +{ + optab builtin_optab; + rtx op0, op1, op2, insns; + tree fndecl = get_callee_fndecl (exp); + tree arg0, arg1, arg2; + enum machine_mode mode; + + if (!validate_arglist (exp, REAL_TYPE, REAL_TYPE, REAL_TYPE, VOID_TYPE)) + return NULL_RTX; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + + switch (DECL_FUNCTION_CODE (fndecl)) + { + CASE_FLT_FN (BUILT_IN_FMA): + builtin_optab = fma_optab; break; + default: + gcc_unreachable (); + } + + /* Make a suitable register to place result in. */ + mode = TYPE_MODE (TREE_TYPE (exp)); + + /* Before working hard, check whether the instruction is available. */ + if (optab_handler (builtin_optab, mode) == CODE_FOR_nothing) + return NULL_RTX; + + target = gen_reg_rtx (mode); + + /* Always stabilize the argument list. */ + CALL_EXPR_ARG (exp, 0) = arg0 = builtin_save_expr (arg0); + CALL_EXPR_ARG (exp, 1) = arg1 = builtin_save_expr (arg1); + CALL_EXPR_ARG (exp, 2) = arg2 = builtin_save_expr (arg2); + + op0 = expand_expr (arg0, subtarget, VOIDmode, EXPAND_NORMAL); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + + start_sequence (); + + /* Compute into TARGET. + Set TARGET to wherever the result comes back. */ + target = expand_ternary_op (mode, builtin_optab, op0, op1, op2, + target, 0); + + /* If we were unable to expand via the builtin, stop the sequence + (without outputting the insns) and call to the library function + with the stabilized argument list. */ + if (target == 0) + { + end_sequence (); + return expand_call (exp, target, target == const0_rtx); + } + + /* Output the entire sequence. */ + insns = get_insns (); + end_sequence (); + emit_insn (insns); + + return target; +} + /* Expand a call to the builtin sin and cos math functions. 
Return NULL_RTX if a normal call should be emitted rather than expanding the function in-line. EXP is the expression that is a call to the builtin @@ -5829,6 +5903,12 @@ expand_builtin (tree exp, rtx target, rt return target; break; + CASE_FLT_FN (BUILT_IN_FMA): + target = expand_builtin_mathfn_ternary (exp, target, subtarget); + if (target) + return target; + break; + CASE_FLT_FN (BUILT_IN_ILOGB): if (! flag_unsafe_math_optimizations) break; Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c (revision 0) @@ -0,0 +1,84 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ +/* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math -mno-fused-madd" } */ +/* { dg-final { scan-assembler-times "xvmadd" 2 } } */ +/* { dg-final { scan-assembler-times "xsmadd" 1 } } */ +/* { dg-final { scan-assembler-times "fmadds" 1 } } */ + +/* Only the functions calling the builtin should generate an appropriate (a * + b) + c instruction. 
*/ + +double +builtin_fma (double b, double c, double d) +{ + return __builtin_fma (b, c, d); +} + +float +builtin_fmaf (float b, float c, float d) +{ + return __builtin_fmaf (b, c, d); +} + +double +normal_fma (double b, double c, double d) +{ + return (b * c) + d; +} + +float +normal_fmaf (float b, float c, float d) +{ + return (b * c) + d; +} + +#ifndef SIZE +#define SIZE 1024 +#endif + +double vda[SIZE] __attribute__((__aligned__(32))); +double vdb[SIZE] __attribute__((__aligned__(32))); +double vdc[SIZE] __attribute__((__aligned__(32))); +double vdd[SIZE] __attribute__((__aligned__(32))); + +float vfa[SIZE] __attribute__((__aligned__(32))); +float vfb[SIZE] __attribute__((__aligned__(32))); +float vfc[SIZE] __attribute__((__aligned__(32))); +float vfd[SIZE] __attribute__((__aligned__(32))); + +void +vector_fma (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vda[i] = __builtin_fma (vdb[i], vdc[i], vdd[i]); +} + +void +vector_fmaf (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]); +} + +void +vnormal_fma (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vda[i] = (vdb[i] * vdc[i]) + vdd[i]; +} + +void +vnormal_fmaf (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vfa[i] = (vfb[i] * vfc[i]) + vfd[i]; +} Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c (revision 0) @@ -0,0 +1,61 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_altivec_ok } */ +/* { dg-options "-O3 -ftree-vectorize -mcpu=power6 -maltivec -ffast-math" } */ +/* { dg-final { scan-assembler-times "vmaddfp" 2 } } */ +/* { dg-final { scan-assembler-times "fmadd " 2 } } */ +/* { dg-final { scan-assembler-times "fmadds" 2 } } */ + +/* All functions should 
generate an appropriate (a * b) + c instruction + since -mfused-madd is on by default. */ + +double +builtin_fma (double b, double c, double d) +{ + return __builtin_fma (b, c, d); +} + +float +builtin_fmaf (float b, float c, float d) +{ + return __builtin_fmaf (b, c, d); +} + +double +normal_fma (double b, double c, double d) +{ + return (b * c) + d; +} + +float +normal_fmaf (float b, float c, float d) +{ + return (b * c) + d; +} + +#ifndef SIZE +#define SIZE 1024 +#endif + +float vfa[SIZE] __attribute__((__aligned__(32))); +float vfb[SIZE] __attribute__((__aligned__(32))); +float vfc[SIZE] __attribute__((__aligned__(32))); +float vfd[SIZE] __attribute__((__aligned__(32))); + +void +vector_fmaf (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]); +} + +void +vnormal_fmaf (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vfa[i] = (vfb[i] * vfc[i]) + vfd[i]; +} Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c (revision 0) @@ -0,0 +1,61 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_altivec_ok } */ +/* { dg-options "-O3 -ftree-vectorize -mcpu=power6 -maltivec -ffast-math -mno-fused-madd" } */ +/* { dg-final { scan-assembler-times "vmaddfp" 1 } } */ +/* { dg-final { scan-assembler-times "fmadd " 1 } } */ +/* { dg-final { scan-assembler-times "fmadds" 1 } } */ + +/* Only the functions calling the builtin should generate an appropriate + (a * b) + c instruction. 
*/ + +double +builtin_fma (double b, double c, double d) +{ + return __builtin_fma (b, c, d); +} + +float +builtin_fmaf (float b, float c, float d) +{ + return __builtin_fmaf (b, c, d); +} + +double +normal_fma (double b, double c, double d) +{ + return (b * c) + d; +} + +float +normal_fmaf (float b, float c, float d) +{ + return (b * c) + d; +} + +#ifndef SIZE +#define SIZE 1024 +#endif + +float vfa[SIZE] __attribute__((__aligned__(32))); +float vfb[SIZE] __attribute__((__aligned__(32))); +float vfc[SIZE] __attribute__((__aligned__(32))); +float vfd[SIZE] __attribute__((__aligned__(32))); + +void +vector_fmaf (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]); +} + +void +vnormal_fmaf (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vfa[i] = (vfb[i] * vfc[i]) + vfd[i]; +} Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c (revision 0) @@ -0,0 +1,84 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ +/* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math" } */ +/* { dg-final { scan-assembler-times "xvmadd" 4 } } */ +/* { dg-final { scan-assembler-times "xsmadd" 2 } } */ +/* { dg-final { scan-assembler-times "fmadds" 2 } } */ + +/* All functions should generate an appropriate (a * b) + c instruction + since -mfused-madd is on by default. 
*/ + +double +builtin_fma (double b, double c, double d) +{ + return __builtin_fma (b, c, d); +} + +float +builtin_fmaf (float b, float c, float d) +{ + return __builtin_fmaf (b, c, d); +} + +double +normal_fma (double b, double c, double d) +{ + return (b * c) + d; +} + +float +normal_fmaf (float b, float c, float d) +{ + return (b * c) + d; +} + +#ifndef SIZE +#define SIZE 1024 +#endif + +double vda[SIZE] __attribute__((__aligned__(32))); +double vdb[SIZE] __attribute__((__aligned__(32))); +double vdc[SIZE] __attribute__((__aligned__(32))); +double vdd[SIZE] __attribute__((__aligned__(32))); + +float vfa[SIZE] __attribute__((__aligned__(32))); +float vfb[SIZE] __attribute__((__aligned__(32))); +float vfc[SIZE] __attribute__((__aligned__(32))); +float vfd[SIZE] __attribute__((__aligned__(32))); + +void +vector_fma (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vda[i] = __builtin_fma (vdb[i], vdc[i], vdd[i]); +} + +void +vector_fmaf (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]); +} + +void +vnormal_fma (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vda[i] = (vdb[i] * vdc[i]) + vdd[i]; +} + +void +vnormal_fmaf (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + vfa[i] = (vfb[i] * vfc[i]) + vfd[i]; +} Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 164946) +++ gcc/tree-vect-stmts.c (working copy) @@ -1357,10 +1357,10 @@ vectorizable_call (gimple stmt, gimple_s vectype_in = NULL_TREE; nargs = gimple_call_num_args (stmt); - /* Bail out if the function has more than two arguments, we - do not have interesting builtin functions to vectorize with - more than two arguments. No arguments is also not good. */ - if (nargs == 0 || nargs > 2) + /* Bail out if the function has more than three arguments, we do not have + interesting builtin functions to vectorize with more than two arguments + except for fma. 
No arguments is also not good. */ + if (nargs == 0 || nargs > 3) return false; for (i = 0; i < nargs; i++) Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 164946) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -3895,6 +3895,22 @@ rs6000_builtin_vectorized_function (tree if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM]; break; + case BUILT_IN_FMA: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_XVMADDDP]; + break; + case BUILT_IN_FMAF: + if (VECTOR_UNIT_VSX_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[VSX_BUILTIN_XVMADDSP]; + else if (VECTOR_UNIT_ALTIVEC_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[ALTIVEC_BUILTIN_VMADDFP]; + break; case BUILT_IN_TRUNC: if (VECTOR_UNIT_VSX_P (V2DFmode) && out_mode == DFmode && out_n == 2 Index: gcc/config/rs6000/rs6000.md =================================================================== --- gcc/config/rs6000/rs6000.md (revision 164946) +++ gcc/config/rs6000/rs6000.md (working copy) @@ -110,6 +110,7 @@ (define_constants (UNSPEC_LFIWAX 56) (UNSPEC_LFIWZX 57) (UNSPEC_FCTIWUZ 58) + (UNSPEC_FMA 59) ]) ;; @@ -5844,6 +5845,39 @@ (define_insn "fres" "fres %0,%1" [(set_attr "type" "fp")]) +(define_expand "fmasf4" + [(set (match_operand:SF 0 "gpc_reg_operand" "") + (plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "") + (match_operand:SF 2 "gpc_reg_operand" "")) + (match_operand:SF 3 "gpc_reg_operand" "")))] + "TARGET_HARD_FLOAT && TARGET_FPRS + && ((TARGET_POWERPC && TARGET_SINGLE_FLOAT) + || ! 
TARGET_POWERPC)" +{ + if (!TARGET_FUSED_MADD) + { + if (TARGET_POWERPC) + emit_insn (gen_fmasf4_powerpc (operands[0], operands[1], operands[2], + operands[3])); + else + emit_insn (gen_fmasf4_power (operands[0], operands[1], operands[2], + operands[3])); + DONE; + } +}) + +; Fma builtin that will work even if -mno-fused-madd is used. +(define_insn "fmasf4_powerpc" + [(set (match_operand:SF 0 "gpc_reg_operand" "=f") + (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f") + (match_operand:SF 2 "gpc_reg_operand" "f") + (match_operand:SF 3 "gpc_reg_operand" "f")] + UNSPEC_FMA))] + "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_POWERPC" + "fmadds %0,%1,%2,%3" + [(set_attr "type" "fp") + (set_attr "fp_type" "fp_maddsub_s")]) + (define_insn "*fmaddsf4_powerpc" [(set (match_operand:SF 0 "gpc_reg_operand" "=f") (plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "%f") @@ -5855,6 +5889,16 @@ (define_insn "*fmaddsf4_powerpc" [(set_attr "type" "fp") (set_attr "fp_type" "fp_maddsub_s")]) +(define_insn "fmasf4_power" + [(set (match_operand:SF 0 "gpc_reg_operand" "=f") + (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f") + (match_operand:SF 2 "gpc_reg_operand" "f") + (match_operand:SF 3 "gpc_reg_operand" "f")] + UNSPEC_FMA))] + "TARGET_HARD_FLOAT && TARGET_FPRS && !TARGET_POWERPC" + "{fma|fmadd} %0,%1,%2,%3" + [(set_attr "type" "dmul")]) + (define_insn "*fmaddsf4_power" [(set (match_operand:SF 0 "gpc_reg_operand" "=f") (plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "%f") @@ -6280,6 +6324,40 @@ (define_insn "*rsqrtdf_internal1" "frsqrte %0,%1" [(set_attr "type" "fp")]) +(define_expand "fmadf4" + [(set (match_operand:DF 0 "gpc_reg_operand" "") + (plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "") + (match_operand:DF 2 "gpc_reg_operand" "")) + (match_operand:DF 3 "gpc_reg_operand" "")))] + "TARGET_HARD_FLOAT && TARGET_FPRS + && ((TARGET_POWERPC && TARGET_SINGLE_FLOAT) + || ! 
TARGET_POWERPC)" +{ + if (!TARGET_FUSED_MADD) + { + if (VECTOR_UNIT_VSX_P (DFmode)) + emit_insn (gen_vsx_fmadddf4_2 (operands[0], operands[1], operands[2], + operands[3])); + else + emit_insn (gen_fmadf4_fpr (operands[0], operands[1], operands[2], + operands[3])); + DONE; + } +}) + +; Fma builtin that will work even if -mno-fused-madd is used. +(define_insn "fmadf4_fpr" + [(set (match_operand:DF 0 "gpc_reg_operand" "=d") + (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d") + (match_operand:DF 2 "gpc_reg_operand" "d") + (match_operand:DF 3 "gpc_reg_operand" "d")] + UNSPEC_FMA))] + "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT + && VECTOR_UNIT_NONE_P (DFmode)" + "{fma|fmadd} %0,%1,%2,%3" + [(set_attr "type" "dmul") + (set_attr "fp_type" "fp_maddsub_d")]) + (define_insn "*fmadddf4_fpr" [(set (match_operand:DF 0 "gpc_reg_operand" "=d") (plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "%d")

[PATCH], RFC Add support for C99 fma/fmaf/fmal builtins

Commit Message

Comments

Patch