Message ID | 20130408111056.gvvtnyc8w04cww8g-nzlynne@webmail.spamcop.net |
---|---|
State | New |
Headers | show |
On Mon, Apr 8, 2013 at 5:10 PM, Joern Rennecke <joern.rennecke@embecosm.com> wrote: > This is basically the same patch as attached to the PR, except that I > have changed the goto-loop into a do-while loop with a new comment; > this caused the need for a lot of reformatting. Can you please include a testcase that shows the effect of this? > bootstrapped & regtested on i686-pc-linux-gnu. > > 2013-04-08 Joern Rennecke <joern.rennecke@embecosm.com> > > * tree-ssa-math-opts.c (mult_to_fma_pass): New file static struct. > (convert_mult_to_fma): In first pass, don't use an fms construct > when we don't have an fms operation, but fmna. it's fnma I believe. > (execute_optimize_widening_mul): Add a second pass if > convert_mult_to_fma requests it. > > Index: gcc/tree-ssa-math-opts.c > =================================================================== > --- gcc/tree-ssa-math-opts.c (revision 197578) > +++ gcc/tree-ssa-math-opts.c (working copy) > @@ -2461,6 +2461,12 @@ convert_plusminus_to_widen (gimple_stmt_ > return true; > } > > +static struct > +{ > + bool second_pass; > + bool retry_request; > +} mult_to_fma_pass; > + > /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2 > with uses in additions and subtractions to form fused multiply-add > operations. Returns true if successful and MUL_STMT should be removed. > */ > @@ -2570,6 +2576,22 @@ convert_mult_to_fma (gimple mul_stmt, tr > return false; > } > > + /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed > + by a MULT_EXPR that we'll visit later, we might be able to > + get a more profitable match with fnma. > + OTOH, if we don't, a negate / fma pair has likely lower latency > + that a mult / subtract pair. 
*/ This makes it sound that this is purely an ordering issue, thus for x = a * b; y = c * d; z = x - y; instead of y = c * d; z = a * b + (-y); you want to generate x = a * b; z = -(c * d + (-x)); I fail to see why you need two passes for this rather than considering the case that the immediate use stmt of the multiplication we start from combines another multiplication with a MINUS_EXPR. That is ... > + if (use_code == MINUS_EXPR && !negate_p > + && gimple_assign_rhs1 (use_stmt) == result > + && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing > + && optab_handler (fnma_optab, TYPE_MODE (type)) != > CODE_FOR_nothing > + && mult_to_fma_pass.second_pass == false) see if that is the case here and simply not do the transform. A second pass will not recover from that without destroying the fnma pattern (testcase?) Richard. > + { > + /* ??? Could make setting of retry_request dependent on some > + rtx_cost measure we evaluate beforehand. */ > + mult_to_fma_pass.retry_request = true; > + return false; > + } > /* We can't handle a * b + a * b. */ > if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt)) > return false; > @@ -2657,76 +2679,89 @@ execute_optimize_widening_mul (void) > > memset (&widen_mul_stats, 0, sizeof (widen_mul_stats)); > > - FOR_EACH_BB (bb) > - { > - gimple_stmt_iterator gsi; > > - for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > - { > - gimple stmt = gsi_stmt (gsi); > - enum tree_code code; > + /* We may run one or two passes. In the first pass, if have fnma, > + but not fms, we don't synthesize fms so that we can get the maximum > + matches for fnma. If we have therefore skipped opportunities to > + synthesize fms, we'll run a second pass where we use any such > + opportunities that still remain. 
*/ > + mult_to_fma_pass.retry_request = false; > + do > + { > + mult_to_fma_pass.second_pass = mult_to_fma_pass.retry_request; > + FOR_EACH_BB (bb) > + { > + gimple_stmt_iterator gsi; > > - if (is_gimple_assign (stmt)) > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > { > - code = gimple_assign_rhs_code (stmt); > - switch (code) > + gimple stmt = gsi_stmt (gsi); > + enum tree_code code; > + > + if (is_gimple_assign (stmt)) > { > - case MULT_EXPR: > - if (!convert_mult_to_widen (stmt, &gsi) > - && convert_mult_to_fma (stmt, > - gimple_assign_rhs1 (stmt), > - gimple_assign_rhs2 (stmt))) > + code = gimple_assign_rhs_code (stmt); > + switch (code) > { > - gsi_remove (&gsi, true); > - release_defs (stmt); > - continue; > - } > - break; > - > - case PLUS_EXPR: > - case MINUS_EXPR: > - convert_plusminus_to_widen (&gsi, stmt, code); > - break; > + case MULT_EXPR: > + if (!convert_mult_to_widen (stmt, &gsi) > + && convert_mult_to_fma (stmt, > + gimple_assign_rhs1 (stmt), > + gimple_assign_rhs2 > (stmt))) > + { > + gsi_remove (&gsi, true); > + release_defs (stmt); > + continue; > + } > + break; > + > + case PLUS_EXPR: > + case MINUS_EXPR: > + convert_plusminus_to_widen (&gsi, stmt, code); > + break; > > - default:; > + default:; > + } > } > - } > - else if (is_gimple_call (stmt) > - && gimple_call_lhs (stmt)) > - { > - tree fndecl = gimple_call_fndecl (stmt); > - if (fndecl > - && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL) > + else if (is_gimple_call (stmt) > + && gimple_call_lhs (stmt)) > { > - switch (DECL_FUNCTION_CODE (fndecl)) > + tree fndecl = gimple_call_fndecl (stmt); > + if (fndecl > + && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL) > { > - case BUILT_IN_POWF: > - case BUILT_IN_POW: > - case BUILT_IN_POWL: > - if (TREE_CODE (gimple_call_arg (stmt, 1)) == > REAL_CST > - && REAL_VALUES_EQUAL > - (TREE_REAL_CST (gimple_call_arg (stmt, 1)), > - dconst2) > - && convert_mult_to_fma (stmt, > - gimple_call_arg (stmt, > 0), > - gimple_call_arg (stmt, > 
0))) > - { > - unlink_stmt_vdef (stmt); > - if (gsi_remove (&gsi, true) > - && gimple_purge_dead_eh_edges (bb)) > - cfg_changed = true; > - release_defs (stmt); > - continue; > - } > + switch (DECL_FUNCTION_CODE (fndecl)) > + { > + case BUILT_IN_POWF: > + case BUILT_IN_POW: > + case BUILT_IN_POWL: > + if ((TREE_CODE (gimple_call_arg (stmt, 1)) > + == REAL_CST) > + && (REAL_VALUES_EQUAL > + (TREE_REAL_CST (gimple_call_arg (stmt, > 1)), > + dconst2)) > + && (convert_mult_to_fma > + (stmt, gimple_call_arg (stmt, 0), > + gimple_call_arg (stmt, 0)))) > + { > + unlink_stmt_vdef (stmt); > + if (gsi_remove (&gsi, true) > + && gimple_purge_dead_eh_edges (bb)) > + cfg_changed = true; > + release_defs (stmt); > + continue; > + } > break; > > - default:; > + default:; > + } > } > } > + gsi_next (&gsi); > } > - gsi_next (&gsi); > } > } > + while (!mult_to_fma_pass.second_pass && mult_to_fma_pass.retry_request); > > statistics_counter_event (cfun, "widening multiplications inserted", > widen_mul_stats.widen_mults_inserted); >
Quoting Richard Biener <richard.guenther@gmail.com>: Oops, I missed your final comment when I wrote my first reply. > I fail to see why you need two passes for this rather than considering the > case that the immediate use stmt of the multiplication we start from > combines another multiplication with a MINUS_EXPR. That is ... > >> + if (use_code == MINUS_EXPR && !negate_p >> + && gimple_assign_rhs1 (use_stmt) == result >> + && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing >> + && optab_handler (fnma_optab, TYPE_MODE (type)) != >> CODE_FOR_nothing >> + && mult_to_fma_pass.second_pass == false) > > see if that is the case here and simply not do the transform. In other words, I would have to take gimple_assign_rhs2 of use_stmt, then find the statement that sets that value, check if this is a multiply, and that it is not expanded as a widening multiply (thus duplicating most of the code in execute_optimize_widening_mul), and also check that if there are any other uses of the multiply result, that these are likewise reduced to fma somehow. At which point we hit recursion. This could easily double the size of this source file. Also, we are alternating between forward and backward searches, which could lead to quadratic complexity where we had linear otherwise. > A second pass > will not recover from that without destroying the fnma pattern (testcase?) The second pass is there to generate the fms where we didn't generate fnma. If fnma (as negate (fma)) has been generated, it will use up the MINUS_EXPR, so that fms cannot be formed anymore. This arrangement is a lot simpler than going for a depth-first decision.
On Tue, Apr 9, 2013 at 4:53 PM, Joern Rennecke <joern.rennecke@embecosm.com> wrote: > Quoting Richard Biener <richard.guenther@gmail.com>: > > Oops, I missed your final comment when I wrote my first reply. > > >> I fail to see why you need two passes for this rather than considering the >> case that the immediate use stmt of the multiplication we start from >> combines another multiplication with a MINUS_EXPR. That is ... >> >>> + if (use_code == MINUS_EXPR && !negate_p >>> + && gimple_assign_rhs1 (use_stmt) == result >>> + && optab_handler (fms_optab, TYPE_MODE (type)) == >>> CODE_FOR_nothing >>> + && optab_handler (fnma_optab, TYPE_MODE (type)) != >>> CODE_FOR_nothing >>> + && mult_to_fma_pass.second_pass == false) >> >> >> see if that is the case here and simply not do the transform. > > > In other words, I would have to take gimple_assign_rh2 of use_stmt, then > find the statement that sets that value, check if this is a multiply, > and that it is not expanded as a widening multiply (thus duplicating most of > the code in execute_optimize_widening_mul), and also check that if there are > any other uses of the multiply result, that these are likewise reduced to > fma somehow. At which point we hit recursion. This could easily double > the size of this source file. Also, we are alternating between forward > and backward searches, which could lead to quadratic complexity where > we had linear otherwise. I don't see that. It's merely a complication of optimal handling of a * b +- c * d vs. just a * b +- c. The pass does simple pattern matching only, not doing a global optimal transform, so adding another special-case is reasonable. Special-casing just for single-use 2nd multiplication simplifies the cases for example. Richard. >> A second pass >> will not recover from that without destroying the fnma pattern (testcase?) > > > The second pass is there to generate the fms where we didn't generate fnma. 
> If fnma (as negate (fma)) has been generated, it will use up the MINUS_EXPR, > so that fms cannot be formed anymore. > This arrangement is a lot simpler than going for a depth-first decision.
Index: gcc/tree-ssa-math-opts.c =================================================================== --- gcc/tree-ssa-math-opts.c (revision 197578) +++ gcc/tree-ssa-math-opts.c (working copy) @@ -2461,6 +2461,12 @@ convert_plusminus_to_widen (gimple_stmt_ return true; } +static struct +{ + bool second_pass; + bool retry_request; +} mult_to_fma_pass; + /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2 with uses in additions and subtractions to form fused multiply-add operations. Returns true if successful and MUL_STMT should be removed. */ @@ -2570,6 +2576,22 @@ convert_mult_to_fma (gimple mul_stmt, tr return false; } + /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed + by a MULT_EXPR that we'll visit later, we might be able to + get a more profitable match with fnma. + OTOH, if we don't, a negate / fma pair has likely lower latency + that a mult / subtract pair. */ + if (use_code == MINUS_EXPR && !negate_p + && gimple_assign_rhs1 (use_stmt) == result + && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing + && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing + && mult_to_fma_pass.second_pass == false) + { + /* ??? Could make setting of retry_request dependent on some + rtx_cost measure we evaluate beforehand. */ + mult_to_fma_pass.retry_request = true; + return false; + } /* We can't handle a * b + a * b. */ if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt)) return false; @@ -2657,76 +2679,89 @@ execute_optimize_widening_mul (void) memset (&widen_mul_stats, 0, sizeof (widen_mul_stats)); - FOR_EACH_BB (bb) - { - gimple_stmt_iterator gsi; - for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) - { - gimple stmt = gsi_stmt (gsi); - enum tree_code code; + /* We may run one or two passes. In the first pass, if have fnma, + but not fms, we don't synthesize fms so that we can get the maximum + matches for fnma. 
If we have therefore skipped opportunities to + synthesize fms, we'll run a second pass where we use any such + opportunities that still remain. */ + mult_to_fma_pass.retry_request = false; + do + { + mult_to_fma_pass.second_pass = mult_to_fma_pass.retry_request; + FOR_EACH_BB (bb) + { + gimple_stmt_iterator gsi; - if (is_gimple_assign (stmt)) + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) { - code = gimple_assign_rhs_code (stmt); - switch (code) + gimple stmt = gsi_stmt (gsi); + enum tree_code code; + + if (is_gimple_assign (stmt)) { - case MULT_EXPR: - if (!convert_mult_to_widen (stmt, &gsi) - && convert_mult_to_fma (stmt, - gimple_assign_rhs1 (stmt), - gimple_assign_rhs2 (stmt))) + code = gimple_assign_rhs_code (stmt); + switch (code) { - gsi_remove (&gsi, true); - release_defs (stmt); - continue; - } - break; - - case PLUS_EXPR: - case MINUS_EXPR: - convert_plusminus_to_widen (&gsi, stmt, code); - break; + case MULT_EXPR: + if (!convert_mult_to_widen (stmt, &gsi) + && convert_mult_to_fma (stmt, + gimple_assign_rhs1 (stmt), + gimple_assign_rhs2 (stmt))) + { + gsi_remove (&gsi, true); + release_defs (stmt); + continue; + } + break; + + case PLUS_EXPR: + case MINUS_EXPR: + convert_plusminus_to_widen (&gsi, stmt, code); + break; - default:; + default:; + } } - } - else if (is_gimple_call (stmt) - && gimple_call_lhs (stmt)) - { - tree fndecl = gimple_call_fndecl (stmt); - if (fndecl - && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL) + else if (is_gimple_call (stmt) + && gimple_call_lhs (stmt)) { - switch (DECL_FUNCTION_CODE (fndecl)) + tree fndecl = gimple_call_fndecl (stmt); + if (fndecl + && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL) { - case BUILT_IN_POWF: - case BUILT_IN_POW: - case BUILT_IN_POWL: - if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST - && REAL_VALUES_EQUAL - (TREE_REAL_CST (gimple_call_arg (stmt, 1)), - dconst2) - && convert_mult_to_fma (stmt, - gimple_call_arg (stmt, 0), - gimple_call_arg (stmt, 0))) - { - unlink_stmt_vdef 
(stmt); - if (gsi_remove (&gsi, true) - && gimple_purge_dead_eh_edges (bb)) - cfg_changed = true; - release_defs (stmt); - continue; - } + switch (DECL_FUNCTION_CODE (fndecl)) + { + case BUILT_IN_POWF: + case BUILT_IN_POW: + case BUILT_IN_POWL: + if ((TREE_CODE (gimple_call_arg (stmt, 1)) + == REAL_CST) + && (REAL_VALUES_EQUAL + (TREE_REAL_CST (gimple_call_arg (stmt, 1)), + dconst2)) + && (convert_mult_to_fma + (stmt, gimple_call_arg (stmt, 0), + gimple_call_arg (stmt, 0)))) + { + unlink_stmt_vdef (stmt); + if (gsi_remove (&gsi, true) + && gimple_purge_dead_eh_edges (bb)) + cfg_changed = true; + release_defs (stmt); + continue; + } break; - default:; + default:; + } } } + gsi_next (&gsi); } - gsi_next (&gsi); } } + while (!mult_to_fma_pass.second_pass && mult_to_fma_pass.retry_request); statistics_counter_event (cfun, "widening multiplications inserted", widen_mul_stats.widen_mults_inserted);