Add FMA_EXPR, un-cse multiplications before expansion

Message ID 4CC1FE2F.4020802@redhat.com
State New

Commit Message

Richard Henderson Oct. 22, 2010, 9:12 p.m. UTC
On 10/21/2010 02:41 AM, Richard Guenther wrote:
> +   /* Leave single uses to the RTL combiner; we otherwise regress
> +      in some circumstances.  */
> +   if (single_imm_use (mul_result, &use_p, &use_stmt))
> +     return false;

Like what?  We can't honor those PAREN_EXPR rules in rtl, and
we can't leave this to combine without that.

Ignoring that, consider the following addition to your patch,
which handles the negate during expansion via 4 optabs.


r~

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d80be88..b61a3b5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1834,7 +1834,7 @@ 
 
 ;; Intrinsic FMA operations.
 
-;; The standard name for fma is only available with SSE math enabled.
+;; The standard names for fma are only available with SSE math enabled.
 (define_expand "fma<mode>4"
   [(set (match_operand:FMAMODE 0 "register_operand")
 	(fma:FMAMODE
@@ -1844,6 +1844,33 @@ 
   "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
   "")
 
+(define_expand "fms<mode>4"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(fma:FMAMODE
+	  (match_operand:FMAMODE 1 "nonimmediate_operand")
+	  (match_operand:FMAMODE 2 "nonimmediate_operand")
+	  (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))]
+  "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+  "")
+
+(define_expand "fnma<mode>4"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(fma:FMAMODE
+	  (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand"))
+	  (match_operand:FMAMODE 2 "nonimmediate_operand")
+	  (match_operand:FMAMODE 3 "nonimmediate_operand")))]
+  "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+  "")
+
+(define_expand "fnms<mode>4"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(fma:FMAMODE
+	  (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand"))
+	  (match_operand:FMAMODE 2 "nonimmediate_operand")
+	  (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))]
+  "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+  "")
+
 ;; The builtin for fma4intrin.h is not constrained by SSE math enabled.
 (define_expand "fma4i_fmadd_<mode>"
   [(set (match_operand:FMAMODE 0 "register_operand")
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 6de4f36..8418564 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -3958,6 +3958,36 @@  pattern is used to implement the @code{fma}, @code{fmaf}, and
 multiply followed by the add if the machine does not perform a
 rounding step between the operations.
 
+@cindex @code{fms@var{m}4} instruction pattern
+@item @samp{fms@var{m}4}
+Like @code{fma@var{m}4}, except that operand 3 is subtracted from the
+product instead of added to the product.  This is represented
+in the rtl as
+
+@smallexample
+(fma:@var{m} @var{op1} @var{op2} (neg:@var{m} @var{op3}))
+@end smallexample
+
+@cindex @code{fnma@var{m}4} instruction pattern
+@item @samp{fnma@var{m}4}
+Like @code{fma@var{m}4} except that the intermediate product
+is negated before being added to operand 3.  This is represented
+in the rtl as
+
+@smallexample
+(fma:@var{m} (neg:@var{m} @var{op1}) @var{op2} @var{op3})
+@end smallexample
+
+@cindex @code{fnms@var{m}4} instruction pattern
+@item @samp{fnms@var{m}4}
+Like @code{fms@var{m}4} except that the intermediate product
+is negated before operand 3 is subtracted.  This is represented
+in the rtl as
+
+@smallexample
+(fma:@var{m} (neg:@var{m} @var{op1}) @var{op2} (neg:@var{m} @var{op3}))
+@end smallexample
+
 @cindex @code{min@var{m}3} instruction pattern
 @cindex @code{max@var{m}3} instruction pattern
 @item @samp{smin@var{m}3}, @samp{smax@var{m}3}
diff --git a/gcc/expr.c b/gcc/expr.c
index 56f6eda..f29f6dc 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -7254,7 +7254,7 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
   int ignore;
   bool reduce_bit_field;
   location_t loc = ops->location;
-  tree treeop0, treeop1;
+  tree treeop0, treeop1, treeop2;
 #define REDUCE_BIT_FIELD(expr)	(reduce_bit_field			  \
 				 ? reduce_to_bit_field_precision ((expr), \
 								  target, \
@@ -7267,6 +7267,7 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
 
   treeop0 = ops->op0;
   treeop1 = ops->op1;
+  treeop2 = ops->op2;
 
   /* We should be called only on simple (binary or unary) expressions,
      exactly those that are valid in gimple expressions that aren't
@@ -7624,7 +7625,7 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
     case WIDEN_MULT_PLUS_EXPR:
     case WIDEN_MULT_MINUS_EXPR:
       expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
-      op2 = expand_normal (ops->op2);
+      op2 = expand_normal (treeop2);
       target = expand_widen_pattern_expr (ops, op0, op1, op2,
 					  target, unsignedp);
       return target;
@@ -7711,6 +7712,46 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
       expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL);
       return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp));
 
+    case FMA_EXPR:
+      {
+	optab opt = fma_optab;
+	gimple def0, def2;
+
+	def0 = get_def_for_expr (treeop0, NEGATE_EXPR);
+	def2 = get_def_for_expr (treeop2, NEGATE_EXPR);
+
+	op0 = op2 = NULL;
+
+	if (def0 && def2
+	    && optab_handler (fnms_optab, mode) != CODE_FOR_nothing)
+	  {
+	    opt = fnms_optab;
+	    op0 = expand_normal (gimple_assign_rhs1 (def0));
+	    op2 = expand_normal (gimple_assign_rhs1 (def2));
+	  }
+	else if (def0
+		 && optab_handler (fnma_optab, mode) != CODE_FOR_nothing)
+	  {
+	    opt = fnma_optab;
+	    op0 = expand_normal (gimple_assign_rhs1 (def0));
+	  }
+	else if (def2
+		 && optab_handler (fms_optab, mode) != CODE_FOR_nothing)
+	  {
+	    opt = fms_optab;
+	    op2 = expand_normal (gimple_assign_rhs1 (def2));
+	  }
+
+	if (op0 == NULL)
+	  op0 = expand_expr (treeop0, subtarget, VOIDmode, EXPAND_NORMAL);
+	if (op2 == NULL)
+	  op2 = expand_normal (treeop2);
+	op1 = expand_normal (treeop1);
+
+	return expand_ternary_op (TYPE_MODE (type), opt,
+				  op0, op1, op2, target, 0);
+      }
+
     case MULT_EXPR:
       /* If this is a fixed-point operation, then we cannot use the code
 	 below because "expand_mult" doesn't support sat/no-sat fixed-point
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index 6e0a714..eee9ef8 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -160,6 +160,9 @@  static const char * const optabs[] =
   "set_optab_handler (floor_optab, $A, CODE_FOR_$(floor$a2$))",
   "set_convert_optab_handler (lfloor_optab, $B, $A, CODE_FOR_$(lfloor$F$a$I$b2$))",
   "set_optab_handler (fma_optab, $A, CODE_FOR_$(fma$a4$))",
+  "set_optab_handler (fms_optab, $A, CODE_FOR_$(fms$a4$))",
+  "set_optab_handler (fnma_optab, $A, CODE_FOR_$(fnma$a4$))",
+  "set_optab_handler (fnms_optab, $A, CODE_FOR_$(fnms$a4$))",
   "set_optab_handler (ceil_optab, $A, CODE_FOR_$(ceil$a2$))",
   "set_convert_optab_handler (lceil_optab, $B, $A, CODE_FOR_$(lceil$F$a$I$b2$))",
   "set_optab_handler (round_optab, $A, CODE_FOR_$(round$a2$))",
diff --git a/gcc/gimple-pretty-print.c b/gcc/gimple-pretty-print.c
index c74dd0e..057f35b 100644
--- a/gcc/gimple-pretty-print.c
+++ b/gcc/gimple-pretty-print.c
@@ -400,6 +400,14 @@  dump_ternary_rhs (pretty_printer *buffer, gimple gs, int spc, int flags)
       pp_character (buffer, '>');
       break;
 
+    case FMA_EXPR:
+      dump_generic_node (buffer, gimple_assign_rhs1 (gs), spc, flags, false);
+      pp_string (buffer, " * ");
+      dump_generic_node (buffer, gimple_assign_rhs2 (gs), spc, flags, false);
+      pp_string (buffer, " + ");
+      dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
+      break;
+
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/gimple.c b/gcc/gimple.c
index dea0b83..c008ec0 100644
--- a/gcc/gimple.c
+++ b/gcc/gimple.c
@@ -2530,7 +2530,8 @@  get_gimple_rhs_num_ops (enum tree_code code)
       || (SYM) == TRUTH_XOR_EXPR) ? GIMPLE_BINARY_RHS			    \
    : (SYM) == TRUTH_NOT_EXPR ? GIMPLE_UNARY_RHS				    \
    : ((SYM) == WIDEN_MULT_PLUS_EXPR					    \
-      || (SYM) == WIDEN_MULT_MINUS_EXPR) ? GIMPLE_TERNARY_RHS		    \
+      || (SYM) == WIDEN_MULT_MINUS_EXPR					    \
+      || (SYM) == FMA_EXPR) ? GIMPLE_TERNARY_RHS			    \
    : ((SYM) == COND_EXPR						    \
       || (SYM) == CONSTRUCTOR						    \
       || (SYM) == OBJ_TYPE_REF						    \
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 8b9c9a7..c4dfa60 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -192,6 +192,9 @@  enum optab_index
   OTI_atan2,
   /* Floating multiply/add */
   OTI_fma,
+  OTI_fms,
+  OTI_fnma,
+  OTI_fnms,
 
   /* Move instruction.  */
   OTI_mov,
@@ -435,6 +438,9 @@  enum optab_index
 #define pow_optab (&optab_table[OTI_pow])
 #define atan2_optab (&optab_table[OTI_atan2])
 #define fma_optab (&optab_table[OTI_fma])
+#define fms_optab (&optab_table[OTI_fms])
+#define fnma_optab (&optab_table[OTI_fnma])
+#define fnms_optab (&optab_table[OTI_fnms])
 
 #define mov_optab (&optab_table[OTI_mov])
 #define movstrict_optab (&optab_table[OTI_movstrict])
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index bffa679..2eefd90 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -3655,6 +3655,20 @@  verify_gimple_assign_ternary (gimple stmt)
 	}
       break;
 
+    case FMA_EXPR:
+      if (!useless_type_conversion_p (lhs_type, rhs1_type)
+	  || !useless_type_conversion_p (lhs_type, rhs2_type)
+	  || !useless_type_conversion_p (lhs_type, rhs3_type))
+	{
+	  error ("type mismatch in fused multiply-add expression");
+	  debug_generic_expr (lhs_type);
+	  debug_generic_expr (rhs1_type);
+	  debug_generic_expr (rhs2_type);
+	  debug_generic_expr (rhs3_type);
+	  return true;
+	}
+      break;
+
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c
index 133d916..96e0eb4 100644
--- a/gcc/tree-inline.c
+++ b/gcc/tree-inline.c
@@ -3284,6 +3284,7 @@  estimate_operator_cost (enum tree_code code, eni_weights *weights,
     case POINTER_PLUS_EXPR:
     case MINUS_EXPR:
     case MULT_EXPR:
+    case FMA_EXPR:
 
     case ADDR_SPACE_CONVERT_EXPR:
     case FIXED_CONVERT_EXPR:
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index a814f6f..4ee5080 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -1494,6 +1494,112 @@  convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
   return true;
 }
 
+/* Combine the multiplication at MUL_STMT with uses in additions and
+   subtractions to form fused multiply-add operations.  Returns true
+   if successful and MUL_STMT should be removed.  */
+
+static bool
+convert_mult_to_fma (gimple mul_stmt)
+{
+  tree mul_result = gimple_assign_lhs (mul_stmt);
+  tree type = TREE_TYPE (mul_result);
+  gimple use_stmt, fma_stmt;
+  use_operand_p use_p;
+  imm_use_iterator imm_iter;
+
+  /* If the target doesn't support it, don't generate it.
+     ???  We have no way of querying support for the various variants
+     with negated operands, so for the following we simply assume
+     they are all available ((-a)*b+c, a*b-c and (-a)*b-c).  */
+  if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
+    return false;
+
+  /* We don't want to do bitfield reduction ops.  */
+  if (INTEGRAL_TYPE_P (type)
+      && (TYPE_PRECISION (type)
+	  != GET_MODE_PRECISION (TYPE_MODE (type))))
+    return false;
+
+  /* Leave single uses to the RTL combiner; we otherwise regress
+     in some circumstances.  */
+  if (single_imm_use (mul_result, &use_p, &use_stmt))
+    return false;
+
+  /* Make sure that the multiplication statement becomes dead after
+     the transformation, i.e. that all uses are transformed to FMAs.
+     This means we assume that an FMA operation has the same cost
+     as an addition.  */
+  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
+    {
+      enum tree_code use_code;
+
+      use_stmt = USE_STMT (use_p);
+
+      if (!is_gimple_assign (use_stmt))
+	return false;
+      use_code = gimple_assign_rhs_code (use_stmt);
+      /* ???  Handle NEGATE_EXPR.  */
+      if (use_code != PLUS_EXPR
+	  && use_code != MINUS_EXPR)
+	return false;
+
+      /* We can't handle a * b + a * b.  */
+      if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
+	return false;
+
+      /* For now restrict this operation to single basic blocks.  In theory
+	 we would want to support sinking the multiplication in
+	 m = a*b;
+	 if ()
+	   ma = m + c;
+	 else
+	   d = m;
+	 to form an FMA in the then block and sink the multiplication to the
+	 else block.  */
+      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
+	return false;
+    }
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
+    {
+      tree addop, mulop1;
+      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
+
+      mulop1 = gimple_assign_rhs1 (mul_stmt);
+      if (gimple_assign_rhs1 (use_stmt) == mul_result)
+	{
+	  addop = gimple_assign_rhs2 (use_stmt);
+	  /* a * b - c -> a * b + (-c)  */
+	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+	    addop = force_gimple_operand_gsi (&gsi,
+					      build1 (NEGATE_EXPR,
+						      type, addop),
+					      true, NULL_TREE, true,
+					      GSI_SAME_STMT);
+	}
+      else
+	{
+	  addop = gimple_assign_rhs1 (use_stmt);
+	  /* a - b * c -> (-b) * c + a */
+	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+	    mulop1 = force_gimple_operand_gsi (&gsi,
+					       build1 (NEGATE_EXPR,
+						       type, mulop1),
+					       true, NULL_TREE, true,
+					       GSI_SAME_STMT);
+	}
+
+      fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
+						gimple_assign_lhs (use_stmt),
+						mulop1,
+						gimple_assign_rhs2 (mul_stmt),
+						addop);
+      gsi_replace (&gsi, fma_stmt, true);
+    }
+
+  return true;
+}
+
 /* Find integer multiplications where the operands are extended from
    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
    where appropriate.  */
@@ -1501,31 +1607,45 @@  convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
 static unsigned int
 execute_optimize_widening_mul (void)
 {
-  bool changed = false;
   basic_block bb;
 
   FOR_EACH_BB (bb)
     {
       gimple_stmt_iterator gsi;
 
-      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
         {
 	  gimple stmt = gsi_stmt (gsi);
 	  enum tree_code code;
 
-	  if (!is_gimple_assign (stmt))
-	    continue;
+	  if (is_gimple_assign (stmt))
+	    {
+	      code = gimple_assign_rhs_code (stmt);
+	      switch (code)
+		{
+		case MULT_EXPR:
+		  if (!convert_mult_to_widen (stmt)
+		      && convert_mult_to_fma (stmt))
+		    {
+		      gsi_remove (&gsi, true);
+		      release_defs (stmt);
+		      continue;
+		    }
+		  break;
 
-	  code = gimple_assign_rhs_code (stmt);
-	  if (code == MULT_EXPR)
-	    changed |= convert_mult_to_widen (stmt);
-	  else if (code == PLUS_EXPR || code == MINUS_EXPR)
-	    changed |= convert_plusminus_to_widen (&gsi, stmt, code);
+		case PLUS_EXPR:
+		case MINUS_EXPR:
+		  convert_plusminus_to_widen (&gsi, stmt, code);
+		  break;
+
+		default:;
+		}
+	    }
+	  gsi_next (&gsi);
 	}
     }
 
-  return (changed ? TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
-	  | TODO_verify_stmts : 0);
+  return 0;
 }
 
 static bool
@@ -1549,6 +1669,9 @@  struct gimple_opt_pass pass_optimize_widening_mul =
   0,					/* properties_provided */
   0,					/* properties_destroyed */
   0,					/* todo_flags_start */
-  0                                     /* todo_flags_finish */
+  TODO_verify_ssa
+  | TODO_verify_stmts
+  | TODO_dump_func
+  | TODO_update_ssa                     /* todo_flags_finish */
  }
 };
diff --git a/gcc/tree.def b/gcc/tree.def
index 24729e8..791d699 100644
--- a/gcc/tree.def
+++ b/gcc/tree.def
@@ -1092,6 +1092,12 @@  DEFTREECODE (WIDEN_MULT_PLUS_EXPR, "widen_mult_plus_expr", tcc_expression, 3)
    is subtracted from t3.  */
 DEFTREECODE (WIDEN_MULT_MINUS_EXPR, "widen_mult_plus_expr", tcc_expression, 3)
 
+/* Fused multiply-add.
+   All operands and the result are of the same type.  No intermediate
+   rounding is performed after multiplying operand one with operand two
+   before adding operand three.  */
+DEFTREECODE (FMA_EXPR, "fma_expr", tcc_expression, 3)
+
 /* Whole vector left/right shift in bits.
    Operand 0 is a vector to be shifted.
    Operand 1 is an integer shift amount in bits.  */
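
As a sketch of the GIMPLE-level effect (illustrative only, not part
of the patch): given a multiplication with two eligible uses in the
same block,

  m_1 = a_2 * b_3;
  t_4 = m_1 + c_5;
  u_6 = d_7 - m_1;

convert_mult_to_fma rewrites both uses, the negate for the MINUS
case is forced into a separate statement, and the multiplication
becomes dead:

  t_4 = a_2 * b_3 + c_5;
  n_8 = -a_2;
  u_6 = n_8 * b_3 + d_7;

where the first and last statements are FMA_EXPRs as printed by the
new dump_ternary_rhs code.  A single-use multiplication is left to
combine, per the check quoted at the top.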