@@ -1834,7 +1834,7 @@
;; Intrinsic FMA operations.
-;; The standard name for fma is only available with SSE math enabled.
+;; The standard names fma, fms, fnma and fnms are only available with SSE math enabled.
(define_expand "fma<mode>4"
[(set (match_operand:FMAMODE 0 "register_operand")
(fma:FMAMODE
@@ -1844,6 +1844,33 @@
"(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
"")
+(define_expand "fms<mode>4"
+ [(set (match_operand:FMAMODE 0 "register_operand")
+ (fma:FMAMODE
+ (match_operand:FMAMODE 1 "nonimmediate_operand")
+ (match_operand:FMAMODE 2 "nonimmediate_operand")
+ (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))]
+ "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+ "")
+
+(define_expand "fnma<mode>4"
+ [(set (match_operand:FMAMODE 0 "register_operand")
+ (fma:FMAMODE
+ (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand"))
+ (match_operand:FMAMODE 2 "nonimmediate_operand")
+ (match_operand:FMAMODE 3 "nonimmediate_operand")))]
+ "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+ "")
+
+(define_expand "fnms<mode>4"
+ [(set (match_operand:FMAMODE 0 "register_operand")
+ (fma:FMAMODE
+ (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand"))
+ (match_operand:FMAMODE 2 "nonimmediate_operand")
+ (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))]
+ "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+ "")
+
;; The builtin for fma4intrin.h is not constrained by SSE math enabled.
(define_expand "fma4i_fmadd_<mode>"
[(set (match_operand:FMAMODE 0 "register_operand")
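
A quick smoke test for the new expanders (illustrative only, not part of the
patch; function name and options are mine).  With -O2 -mfma4, and given the
tree-level contraction added further down, both stores should go through the
new named patterns.  Note the product needs more than one use, since single
uses are deliberately left to the RTL combiner:

    /* Expected, assuming the contraction fires: *x expands via
       fma<mode>4, *y via the new fms<mode>4, i.e. a*b + (-c).  */
    void
    fma_and_fms (double a, double b, double c, double *x, double *y)
    {
      double m = a * b;
      *x = m + c;
      *y = m - c;
    }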
@@ -3958,6 +3958,36 @@ pattern is used to implement the @code{fma}, @code{fmaf}, and
multiply followed by the add if the machine does not perform a
rounding step between the operations.
+@cindex @code{fms@var{m}4} instruction pattern
+@item @samp{fms@var{m}4}
+Like @code{fma@var{m}4}, except that operand 3 is subtracted from
+the product instead of added to it. This is represented
+in the RTL as
+
+@smallexample
+(fma:@var{m} @var{op1} @var{op2} (neg:@var{m} @var{op3}))
+@end smallexample
+
+@cindex @code{fnma@var{m}4} instruction pattern
+@item @samp{fnma@var{m}4}
+Like @code{fma@var{m}4}, except that the intermediate product
+is negated before being added to operand 3. This is represented
+in the RTL as
+
+@smallexample
+(fma:@var{m} (neg:@var{m} @var{op1}) @var{op2} @var{op3})
+@end smallexample
+
+@cindex @code{fnms@var{m}4} instruction pattern
+@item @samp{fnms@var{m}4}
+Like @code{fms@var{m}4}, except that the intermediate product
+is negated before operand 3 is subtracted from it. This is represented
+in the RTL as
+
+@smallexample
+(fma:@var{m} (neg:@var{m} @var{op1}) @var{op2} (neg:@var{m} @var{op3}))
+@end smallexample
+
@cindex @code{min@var{m}3} instruction pattern
@cindex @code{max@var{m}3} instruction pattern
@item @samp{smin@var{m}3}, @samp{smax@var{m}3}
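
For reference, the sign conventions of the three new patterns written in
terms of C99 fma (a sketch of the semantics only; the point of the named
patterns is that the intermediate product is not rounded, and negating a
double is exact, so these reference functions are bit-accurate):

    #include <math.h>

    double ref_fms  (double a, double b, double c) { return fma ( a, b, -c); } /* a*b - c    */
    double ref_fnma (double a, double b, double c) { return fma (-a, b,  c); } /* c - a*b    */
    double ref_fnms (double a, double b, double c) { return fma (-a, b, -c); } /* -(a*b) - c */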
@@ -7254,7 +7254,7 @@ expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
int ignore;
bool reduce_bit_field;
location_t loc = ops->location;
- tree treeop0, treeop1;
+ tree treeop0, treeop1, treeop2;
#define REDUCE_BIT_FIELD(expr) (reduce_bit_field \
? reduce_to_bit_field_precision ((expr), \
target, \
@@ -7267,6 +7267,7 @@ expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
treeop0 = ops->op0;
treeop1 = ops->op1;
+ treeop2 = ops->op2;
/* We should be called only on simple (binary or unary) expressions,
exactly those that are valid in gimple expressions that aren't
@@ -7624,7 +7625,7 @@ expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
case WIDEN_MULT_PLUS_EXPR:
case WIDEN_MULT_MINUS_EXPR:
expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
- op2 = expand_normal (ops->op2);
+ op2 = expand_normal (treeop2);
target = expand_widen_pattern_expr (ops, op0, op1, op2,
target, unsignedp);
return target;
@@ -7711,6 +7712,46 @@ expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL);
return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp));
+ case FMA_EXPR:
+ {
+ optab opt = fma_optab;
+ gimple def0, def2;
+
+ def0 = get_def_for_expr (treeop0, NEGATE_EXPR);
+ def2 = get_def_for_expr (treeop2, NEGATE_EXPR);
+
+ op0 = op2 = NULL;
+
+ if (def0 && def2
+ && optab_handler (fnms_optab, mode) != CODE_FOR_nothing)
+ {
+ opt = fnms_optab;
+ op0 = expand_normal (gimple_assign_rhs1 (def0));
+ op2 = expand_normal (gimple_assign_rhs1 (def2));
+ }
+ else if (def0
+ && optab_handler (fnma_optab, mode) != CODE_FOR_nothing)
+ {
+ opt = fnma_optab;
+ op0 = expand_normal (gimple_assign_rhs1 (def0));
+ }
+ else if (def2
+ && optab_handler (fms_optab, mode) != CODE_FOR_nothing)
+ {
+ opt = fms_optab;
+ op2 = expand_normal (gimple_assign_rhs1 (def2));
+ }
+
+ if (op0 == NULL)
+ op0 = expand_expr (treeop0, subtarget, VOIDmode, EXPAND_NORMAL);
+ if (op2 == NULL)
+ op2 = expand_normal (treeop2);
+ op1 = expand_normal (treeop1);
+
+ return expand_ternary_op (TYPE_MODE (type), opt,
+ op0, op1, op2, target, 0);
+ }
+
case MULT_EXPR:
/* If this is a fixed-point operation, then we cannot use the code
below because "expand_mult" doesn't support sat/no-sat fixed-point
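
Schematically, the new FMA_EXPR case strips NEGATE_EXPR definitions feeding
operands 0 and 2 and picks the matching optab (sketch; n1 and n3 stand for
SSA names defined as -a and -c):

    /* FMA_EXPR <a,  b, c >  -> fma_optab
       FMA_EXPR <a,  b, n3>  -> fms_optab    (a*b - c)
       FMA_EXPR <n1, b, c >  -> fnma_optab   (c - a*b)
       FMA_EXPR <n1, b, n3>  -> fnms_optab   (-(a*b) - c)
       If a variant's handler is missing, the negation is left as a
       separately expanded statement and a plainer variant (ultimately
       fma_optab) is used instead.  */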
@@ -160,6 +160,9 @@ static const char * const optabs[] =
"set_optab_handler (floor_optab, $A, CODE_FOR_$(floor$a2$))",
"set_convert_optab_handler (lfloor_optab, $B, $A, CODE_FOR_$(lfloor$F$a$I$b2$))",
"set_optab_handler (fma_optab, $A, CODE_FOR_$(fma$a4$))",
+ "set_optab_handler (fms_optab, $A, CODE_FOR_$(fms$a4$))",
+ "set_optab_handler (fnma_optab, $A, CODE_FOR_$(fnma$a4$))",
+ "set_optab_handler (fnms_optab, $A, CODE_FOR_$(fnms$a4$))",
"set_optab_handler (ceil_optab, $A, CODE_FOR_$(ceil$a2$))",
"set_convert_optab_handler (lceil_optab, $B, $A, CODE_FOR_$(lceil$F$a$I$b2$))",
"set_optab_handler (round_optab, $A, CODE_FOR_$(round$a2$))",
@@ -400,6 +400,14 @@ dump_ternary_rhs (pretty_printer *buffer, gimple gs, int spc, int flags)
pp_character (buffer, '>');
break;
+ case FMA_EXPR:
+ dump_generic_node (buffer, gimple_assign_rhs1 (gs), spc, flags, false);
+ pp_string (buffer, " * ");
+ dump_generic_node (buffer, gimple_assign_rhs2 (gs), spc, flags, false);
+ pp_string (buffer, " + ");
+ dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
+ break;
+
default:
gcc_unreachable ();
}
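
With this, an FMA_EXPR renders in the tree dumps in infix form; note it reads
the same as an open-coded multiply and add.  Something like (SSA numbering
hypothetical):

    _7 = a_1 * b_2 + c_3;   /* a single FMA_EXPR, not a MULT + PLUS pair */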
@@ -2530,7 +2530,8 @@ get_gimple_rhs_num_ops (enum tree_code code)
|| (SYM) == TRUTH_XOR_EXPR) ? GIMPLE_BINARY_RHS \
: (SYM) == TRUTH_NOT_EXPR ? GIMPLE_UNARY_RHS \
: ((SYM) == WIDEN_MULT_PLUS_EXPR \
- || (SYM) == WIDEN_MULT_MINUS_EXPR) ? GIMPLE_TERNARY_RHS \
+ || (SYM) == WIDEN_MULT_MINUS_EXPR \
+ || (SYM) == FMA_EXPR) ? GIMPLE_TERNARY_RHS \
: ((SYM) == COND_EXPR \
|| (SYM) == CONSTRUCTOR \
|| (SYM) == OBJ_TYPE_REF \
@@ -192,6 +192,9 @@ enum optab_index
OTI_atan2,
/* Floating multiply/add */
OTI_fma,
+ OTI_fms,
+ OTI_fnma,
+ OTI_fnms,
/* Move instruction. */
OTI_mov,
@@ -435,6 +438,9 @@ enum optab_index
#define pow_optab (&optab_table[OTI_pow])
#define atan2_optab (&optab_table[OTI_atan2])
#define fma_optab (&optab_table[OTI_fma])
+#define fms_optab (&optab_table[OTI_fms])
+#define fnma_optab (&optab_table[OTI_fnma])
+#define fnms_optab (&optab_table[OTI_fnms])
#define mov_optab (&optab_table[OTI_mov])
#define movstrict_optab (&optab_table[OTI_movstrict])
@@ -3655,6 +3655,20 @@ verify_gimple_assign_ternary (gimple stmt)
}
break;
+ case FMA_EXPR:
+ if (!useless_type_conversion_p (lhs_type, rhs1_type)
+ || !useless_type_conversion_p (lhs_type, rhs2_type)
+ || !useless_type_conversion_p (lhs_type, rhs3_type))
+ {
+ error ("type mismatch in fused multiply-add expression");
+ debug_generic_expr (lhs_type);
+ debug_generic_expr (rhs1_type);
+ debug_generic_expr (rhs2_type);
+ debug_generic_expr (rhs3_type);
+ return true;
+ }
+ break;
+
default:
gcc_unreachable ();
}
@@ -3284,6 +3284,7 @@ estimate_operator_cost (enum tree_code code, eni_weights *weights,
case POINTER_PLUS_EXPR:
case MINUS_EXPR:
case MULT_EXPR:
+ case FMA_EXPR:
case ADDR_SPACE_CONVERT_EXPR:
case FIXED_CONVERT_EXPR:
@@ -1494,6 +1494,112 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
return true;
}
+/* Combine the multiplication at MUL_STMT with its uses in additions and
+   subtractions to form fused multiply-add operations.  Returns true
+   if the transformation succeeded; MUL_STMT should then be removed. */
+
+static bool
+convert_mult_to_fma (gimple mul_stmt)
+{
+ tree mul_result = gimple_assign_lhs (mul_stmt);
+ tree type = TREE_TYPE (mul_result);
+ gimple use_stmt, fma_stmt;
+ use_operand_p use_p;
+ imm_use_iterator imm_iter;
+
+ /* If the target doesn't support it, don't generate it.
+ ??? We have no way of querying support for the various variants
+ with negated operands, so for the following we simply assume
+ they are all available ((-a)*b+c, a*b-c and (-a)*b-c). */
+ if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
+ return false;
+
+ /* We don't want to do bitfield reduction ops. */
+ if (INTEGRAL_TYPE_P (type)
+ && (TYPE_PRECISION (type)
+ != GET_MODE_PRECISION (TYPE_MODE (type))))
+ return false;
+
+  /* Leave single uses to the RTL combiner, as we otherwise regress
+     in some circumstances. */
+ if (single_imm_use (mul_result, &use_p, &use_stmt))
+ return false;
+
+  /* Make sure that the multiplication statement becomes dead after
+     the transformation, i.e. that all of its uses are transformed
+     to FMAs.  This means we assume that an FMA operation has the
+     same cost as an addition. */
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
+ {
+ enum tree_code use_code;
+
+ use_stmt = USE_STMT (use_p);
+
+ if (!is_gimple_assign (use_stmt))
+ return false;
+ use_code = gimple_assign_rhs_code (use_stmt);
+ /* ??? Handle NEGATE_EXPR. */
+ if (use_code != PLUS_EXPR
+ && use_code != MINUS_EXPR)
+ return false;
+
+ /* We can't handle a * b + a * b. */
+ if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
+ return false;
+
+      /* For now restrict this operation to a single basic block.  In theory
+         we would want to support sinking the multiplication in
+         m = a*b;
+         if ()
+           ma = m + c;
+         else
+           d = m;
+         to form an FMA in the then block and sink the multiplication to the
+         else block. */
+ if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
+ return false;
+ }
+
+ FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
+ {
+ tree addop, mulop1;
+ gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
+
+ mulop1 = gimple_assign_rhs1 (mul_stmt);
+ if (gimple_assign_rhs1 (use_stmt) == mul_result)
+ {
+ addop = gimple_assign_rhs2 (use_stmt);
+ /* a * b - c -> a * b + (-c) */
+ if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+ addop = force_gimple_operand_gsi (&gsi,
+ build1 (NEGATE_EXPR,
+ type, addop),
+ true, NULL_TREE, true,
+ GSI_SAME_STMT);
+ }
+ else
+ {
+ addop = gimple_assign_rhs1 (use_stmt);
+ /* a - b * c -> (-b) * c + a */
+ if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+ mulop1 = force_gimple_operand_gsi (&gsi,
+ build1 (NEGATE_EXPR,
+ type, mulop1),
+ true, NULL_TREE, true,
+ GSI_SAME_STMT);
+ }
+
+ fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
+ gimple_assign_lhs (use_stmt),
+ mulop1,
+ gimple_assign_rhs2 (mul_stmt),
+ addop);
+ gsi_replace (&gsi, fma_stmt, true);
+ }
+
+ return true;
+}
+
/* Find integer multiplications where the operands are extended from
smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
where appropriate. */
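
To see the preconditions of convert_mult_to_fma in one place (illustrative
only, assuming -O2 on a target with a usable fma optab): the product must
have at least two uses, every use must be a PLUS_EXPR or MINUS_EXPR in the
same basic block, and afterwards the multiplication is dead:

    void
    two_fnmas (double a, double b, double c, double d, double *x, double *y)
    {
      double m = a * b;   /* both uses converted, so m dies      */
      *x = c - m;         /* -> FMA_EXPR <-a, b, c>, i.e. fnma   */
      *y = d - m;         /* -> FMA_EXPR <-a, b, d>, i.e. fnma   */
    }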
@@ -1501,31 +1607,45 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
static unsigned int
execute_optimize_widening_mul (void)
{
- bool changed = false;
basic_block bb;
FOR_EACH_BB (bb)
{
gimple_stmt_iterator gsi;
- for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
{
gimple stmt = gsi_stmt (gsi);
enum tree_code code;
- if (!is_gimple_assign (stmt))
- continue;
+ if (is_gimple_assign (stmt))
+ {
+ code = gimple_assign_rhs_code (stmt);
+ switch (code)
+ {
+ case MULT_EXPR:
+ if (!convert_mult_to_widen (stmt)
+ && convert_mult_to_fma (stmt))
+ {
+ gsi_remove (&gsi, true);
+ release_defs (stmt);
+ continue;
+ }
+ break;
- code = gimple_assign_rhs_code (stmt);
- if (code == MULT_EXPR)
- changed |= convert_mult_to_widen (stmt);
- else if (code == PLUS_EXPR || code == MINUS_EXPR)
- changed |= convert_plusminus_to_widen (&gsi, stmt, code);
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ convert_plusminus_to_widen (&gsi, stmt, code);
+ break;
+
+ default:;
+ }
+ }
+ gsi_next (&gsi);
}
}
- return (changed ? TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
- | TODO_verify_stmts : 0);
+ return 0;
}
static bool
@@ -1549,6 +1669,9 @@ struct gimple_opt_pass pass_optimize_widening_mul =
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
- 0 /* todo_flags_finish */
+ TODO_verify_ssa
+ | TODO_verify_stmts
+ | TODO_dump_func
+ | TODO_update_ssa /* todo_flags_finish */
}
};
@@ -1092,6 +1092,12 @@ DEFTREECODE (WIDEN_MULT_PLUS_EXPR, "widen_mult_plus_expr", tcc_expression, 3)
is subtracted from t3. */
DEFTREECODE (WIDEN_MULT_MINUS_EXPR, "widen_mult_minus_expr", tcc_expression, 3)
+/* Fused multiply-add.
+   All operands and the result are of the same type.  Operand one is
+   multiplied by operand two, and the product is added to operand three
+   without any intermediate rounding. */
+DEFTREECODE (FMA_EXPR, "fma_expr", tcc_expression, 3)
+
/* Whole vector left/right shift in bits.
Operand 0 is a vector to be shifted.
Operand 1 is an integer shift amount in bits. */
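
The "no intermediate rounding" wording in the FMA_EXPR comment is observable
from C.  A standalone illustration (plain C99 fma, nothing patch-specific):
squaring 2^27 + 1 needs 55 significand bits, so the plain product drops the
low-order 1, while the fused operation rounds only once and recovers it:

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double a = 0x1.0000002p+27;  /* 2^27 + 1, exactly representable    */
      double p = a * a;            /* true value 2^54 + 2^28 + 1 rounds
                                      to 2^54 + 2^28 in double           */
      double r = fma (a, a, -p);   /* single rounding: exact residual    */
      printf ("%g\n", r);          /* prints 1, not 0                    */
      return 0;
    }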