Message ID | 20201217145117.GS3788@tucnak |
---|---|
State | New |
Headers | show |
Series | store-merging: Handle vector CONSTRUCTORs using bswap [PR96239] | expand |
On Thu, 17 Dec 2020, Jakub Jelinek wrote: > On Wed, Dec 16, 2020 at 09:29:31AM +0100, Richard Biener wrote: > > I think it probably makes sense to have some helper split out that > > collects & classifies vector constructor components we can use from > > both forwprop (where matching the V_C_E from integer could be done > > as well IMHO) and bswap (when a permute is involved) and store-merging. > > I've tried to add such helper, but handling over just analysis and letting > each pass handle it differently seems complicated given the limitations of > the bswap infrastructure. > > So, this patch just hooks the optimization also into store-merging so that > the original testcase from the PR can be fixed. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? OK, but ... > Note, I have no idea why the bswap code needs TODO_update_ssa if it changed > things, for the vuses it copies them from the surrounding vuses, which looks > correct to me. Perhaps because it uses force_gimple_operand_gsi* in a few > spots in bswap_replace? Confused... .. that shouldn't cause updating SSA to be necessary. Maybe it at some point did not update virtual operands appropriately. Richard. > 2020-12-17 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/96239 > * gimple-ssa-store-merging.c (maybe_optimize_vector_constructor): New > function. > (get_status_for_store_merging): Don't return BB_INVALID for blocks > with potential bswap optimizable CONSTRUCTORs. > (pass_store_merging::execute): Optimize vector CONSTRUCTORs with bswap > if possible. > > * gcc.dg/tree-ssa/pr96239.c: New test. > > --- gcc/gimple-ssa-store-merging.c.jj 2020-12-16 13:07:51.729733816 +0100 > +++ gcc/gimple-ssa-store-merging.c 2020-12-16 16:02:06.238868137 +0100 > @@ -1255,6 +1255,75 @@ bswap_replace (gimple_stmt_iterator gsi, > return tgt; > } > > +/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs > + using bswap optimizations. CDI_DOMINATORS need to be > + computed on entry. Return true if it has been optimized and > + TODO_update_ssa is needed. */ > + > +static bool > +maybe_optimize_vector_constructor (gimple *cur_stmt) > +{ > + tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; > + struct symbolic_number n; > + bool bswap; > + > + gcc_assert (is_gimple_assign (cur_stmt) > + && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR); > + > + tree rhs = gimple_assign_rhs1 (cur_stmt); > + if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) > + || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) > + || gimple_assign_lhs (cur_stmt) == NULL_TREE) > + return false; > + > + HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; > + switch (sz) > + { > + case 16: > + load_type = bswap_type = uint16_type_node; > + break; > + case 32: > + if (builtin_decl_explicit_p (BUILT_IN_BSWAP32) > + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing) > + { > + load_type = uint32_type_node; > + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); > + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); > + } > + else > + return false; > + break; > + case 64: > + if (builtin_decl_explicit_p (BUILT_IN_BSWAP64) > + && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing > + || (word_mode == SImode > + && builtin_decl_explicit_p (BUILT_IN_BSWAP32) > + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing))) > + { > + load_type = uint64_type_node; > + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); > + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); > + } > + else > + return false; > + break; > + default: > + return false; > + } > + > + gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap); > + if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz) > + return false; > + > + if (bswap && !fndecl && n.range != 16) > + return false; > + > + memset (&nop_stats, 0, sizeof (nop_stats)); > + memset (&bswap_stats, 0, sizeof (bswap_stats)); > + return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, > + bswap_type, load_type, &n, bswap) != NULL_TREE; > +} > + > /* Find manual byte swap implementations as well as load in a given > endianness. Byte swaps are turned into a bswap builtin invokation > while endian loads are converted to bswap builtin invokation or > @@ -5126,6 +5195,7 @@ static enum basic_block_status > get_status_for_store_merging (basic_block bb) > { > unsigned int num_statements = 0; > + unsigned int num_constructors = 0; > gimple_stmt_iterator gsi; > edge e; > > @@ -5138,9 +5208,27 @@ get_status_for_store_merging (basic_bloc > > if (store_valid_for_store_merging_p (stmt) && ++num_statements >= 2) > break; > + > + if (is_gimple_assign (stmt) > + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR) > + { > + tree rhs = gimple_assign_rhs1 (stmt); > + if (VECTOR_TYPE_P (TREE_TYPE (rhs)) > + && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) > + && gimple_assign_lhs (stmt) != NULL_TREE) > + { > + HOST_WIDE_INT sz > + = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; > + if (sz == 16 || sz == 32 || sz == 64) > + { > + num_constructors = 1; > + break; > + } > + } > + } > } > > - if (num_statements == 0) > + if (num_statements == 0 && num_constructors == 0) > return BB_INVALID; > > if (cfun->can_throw_non_call_exceptions && cfun->eh > @@ -5149,7 +5237,7 @@ get_status_for_store_merging (basic_bloc > && e->dest == bb->next_bb) > return BB_EXTENDED_VALID; > > - return num_statements >= 2 ? BB_VALID : BB_INVALID; > + return (num_statements >= 2 || num_constructors) ? BB_VALID : BB_INVALID; > } > > /* Entry point for the pass. Go over each basic block recording chains of > @@ -5163,6 +5251,7 @@ pass_store_merging::execute (function *f > basic_block bb; > hash_set<gimple *> orig_stmts; > bool changed = false, open_chains = false; > + unsigned todo = 0; > > /* If the function can throw and catch non-call exceptions, we'll be trying > to merge stores across different basic blocks so we need to first unsplit > @@ -5189,9 +5278,10 @@ pass_store_merging::execute (function *f > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "Processing basic block <%d>:\n", bb->index); > > - for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); ) > { > gimple *stmt = gsi_stmt (gsi); > + gsi_next (&gsi); > > if (is_gimple_debug (stmt)) > continue; > @@ -5207,6 +5297,14 @@ pass_store_merging::execute (function *f > continue; > } > > + if (is_gimple_assign (stmt) > + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR > + && maybe_optimize_vector_constructor (stmt)) > + { > + todo = TODO_update_ssa; > + continue; > + } > + > if (store_valid_for_store_merging_p (stmt)) > changed |= process_store (stmt); > else > @@ -5230,10 +5328,10 @@ pass_store_merging::execute (function *f > if (cfun->can_throw_non_call_exceptions && cfun->eh && changed) > { > free_dominance_info (CDI_DOMINATORS); > - return TODO_cleanup_cfg; > + return TODO_cleanup_cfg | todo; > } > > - return 0; > + return todo; > } > > } // anon namespace > --- gcc/testsuite/gcc.dg/tree-ssa/pr96239.c.jj 2020-12-16 16:10:30.013256862 +0100 > +++ gcc/testsuite/gcc.dg/tree-ssa/pr96239.c 2020-12-16 16:11:08.802822537 +0100 > @@ -0,0 +1,18 @@ > +/* PR tree-optimization/96239 */ > +/* { dg-do compile { target { ilp32 || lp64 } } } */ > +/* { dg-options "-O3 -fdump-tree-optimized" } */ > +/* { dg-final { scan-tree-dump " r>> 8;" "optimized" { target bswap } } } */ > + > +union U { unsigned char c[2]; unsigned short s; }; > + > +unsigned short > +foo (unsigned short x) > +{ > + union U u; > + u.s = x; > + unsigned char v = u.c[0]; > + unsigned char w = u.c[1]; > + u.c[0] = w; > + u.c[1] = v; > + return u.s; > +} > > > Jakub > >
On Tue, Jan 05, 2021 at 01:55:21PM +0100, Richard Biener wrote: > > Note, I have no idea why the bswap code needs TODO_update_ssa if it changed > > things, for the vuses it copies them from the surrounding vuses, which looks > > correct to me. Perhaps because it uses force_gimple_operand_gsi* in a few > > spots in bswap_replace? Confused... > > .. that shouldn't cause updating SSA to be necessary. Maybe it at some > point did not update virtual operands appropriately. Ok, I've committed the following version without the TODO_update_ssa, which passed another bootstrap/regtest on x86_64-linux and i686-linux. 2021-01-05 Jakub Jelinek <jakub@redhat.com> PR tree-optimization/96239 * gimple-ssa-store-merging.c (maybe_optimize_vector_constructor): New function. (get_status_for_store_merging): Don't return BB_INVALID for blocks with potential bswap optimizable CONSTRUCTORs. (pass_store_merging::execute): Optimize vector CONSTRUCTORs with bswap if possible. * gcc.dg/tree-ssa/pr96239.c: New test. --- gcc/gimple-ssa-store-merging.c.jj 2020-12-16 13:07:51.729733816 +0100 +++ gcc/gimple-ssa-store-merging.c 2020-12-16 16:02:06.238868137 +0100 @@ -1255,6 +1255,75 @@ bswap_replace (gimple_stmt_iterator gsi, return tgt; } +/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs + using bswap optimizations. CDI_DOMINATORS need to be + computed on entry. Return true if it has been optimized and + TODO_update_ssa is needed. */ + +static bool +maybe_optimize_vector_constructor (gimple *cur_stmt) +{ + tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; + struct symbolic_number n; + bool bswap; + + gcc_assert (is_gimple_assign (cur_stmt) + && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR); + + tree rhs = gimple_assign_rhs1 (cur_stmt); + if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) + || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) + || gimple_assign_lhs (cur_stmt) == NULL_TREE) + return false; + + HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; + switch (sz) + { + case 16: + load_type = bswap_type = uint16_type_node; + break; + case 32: + if (builtin_decl_explicit_p (BUILT_IN_BSWAP32) + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing) + { + load_type = uint32_type_node; + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); + } + else + return false; + break; + case 64: + if (builtin_decl_explicit_p (BUILT_IN_BSWAP64) + && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing + || (word_mode == SImode + && builtin_decl_explicit_p (BUILT_IN_BSWAP32) + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing))) + { + load_type = uint64_type_node; + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); + } + else + return false; + break; + default: + return false; + } + + gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap); + if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz) + return false; + + if (bswap && !fndecl && n.range != 16) + return false; + + memset (&nop_stats, 0, sizeof (nop_stats)); + memset (&bswap_stats, 0, sizeof (bswap_stats)); + return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, + bswap_type, load_type, &n, bswap) != NULL_TREE; +} + /* Find manual byte swap implementations as well as load in a given endianness. Byte swaps are turned into a bswap builtin invokation while endian loads are converted to bswap builtin invokation or @@ -5126,6 +5195,7 @@ static enum basic_block_status get_status_for_store_merging (basic_block bb) { unsigned int num_statements = 0; + unsigned int num_constructors = 0; gimple_stmt_iterator gsi; edge e; @@ -5138,9 +5208,27 @@ get_status_for_store_merging (basic_bloc if (store_valid_for_store_merging_p (stmt) && ++num_statements >= 2) break; + + if (is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR) + { + tree rhs = gimple_assign_rhs1 (stmt); + if (VECTOR_TYPE_P (TREE_TYPE (rhs)) + && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) + && gimple_assign_lhs (stmt) != NULL_TREE) + { + HOST_WIDE_INT sz + = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; + if (sz == 16 || sz == 32 || sz == 64) + { + num_constructors = 1; + break; + } + } + } } - if (num_statements == 0) + if (num_statements == 0 && num_constructors == 0) return BB_INVALID; if (cfun->can_throw_non_call_exceptions && cfun->eh @@ -5149,7 +5237,7 @@ get_status_for_store_merging (basic_bloc && e->dest == bb->next_bb) return BB_EXTENDED_VALID; - return num_statements >= 2 ? BB_VALID : BB_INVALID; + return (num_statements >= 2 || num_constructors) ? BB_VALID : BB_INVALID; } /* Entry point for the pass. Go over each basic block recording chains of @@ -5189,9 +5278,10 @@ pass_store_merging::execute (function *f if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "Processing basic block <%d>:\n", bb->index); - for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); ) { gimple *stmt = gsi_stmt (gsi); + gsi_next (&gsi); if (is_gimple_debug (stmt)) continue; @@ -5207,6 +5297,11 @@ pass_store_merging::execute (function *f continue; } + if (is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR + && maybe_optimize_vector_constructor (stmt)) + continue; + if (store_valid_for_store_merging_p (stmt)) changed |= process_store (stmt); else --- gcc/testsuite/gcc.dg/tree-ssa/pr96239.c.jj 2020-12-16 16:10:30.013256862 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr96239.c 2020-12-16 16:11:08.802822537 +0100 @@ -0,0 +1,18 @@ +/* PR tree-optimization/96239 */ +/* { dg-do compile { target { ilp32 || lp64 } } } */ +/* { dg-options "-O3 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump " r>> 8;" "optimized" { target bswap } } } */ + +union U { unsigned char c[2]; unsigned short s; }; + +unsigned short +foo (unsigned short x) +{ + union U u; + u.s = x; + unsigned char v = u.c[0]; + unsigned char w = u.c[1]; + u.c[0] = w; + u.c[1] = v; + return u.s; +} Jakub
--- gcc/gimple-ssa-store-merging.c.jj 2020-12-16 13:07:51.729733816 +0100 +++ gcc/gimple-ssa-store-merging.c 2020-12-16 16:02:06.238868137 +0100 @@ -1255,6 +1255,75 @@ bswap_replace (gimple_stmt_iterator gsi, return tgt; } +/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs + using bswap optimizations. CDI_DOMINATORS need to be + computed on entry. Return true if it has been optimized and + TODO_update_ssa is needed. */ + +static bool +maybe_optimize_vector_constructor (gimple *cur_stmt) +{ + tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; + struct symbolic_number n; + bool bswap; + + gcc_assert (is_gimple_assign (cur_stmt) + && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR); + + tree rhs = gimple_assign_rhs1 (cur_stmt); + if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) + || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) + || gimple_assign_lhs (cur_stmt) == NULL_TREE) + return false; + + HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; + switch (sz) + { + case 16: + load_type = bswap_type = uint16_type_node; + break; + case 32: + if (builtin_decl_explicit_p (BUILT_IN_BSWAP32) + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing) + { + load_type = uint32_type_node; + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); + } + else + return false; + break; + case 64: + if (builtin_decl_explicit_p (BUILT_IN_BSWAP64) + && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing + || (word_mode == SImode + && builtin_decl_explicit_p (BUILT_IN_BSWAP32) + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing))) + { + load_type = uint64_type_node; + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); + } + else + return false; + break; + default: + return false; + } + + gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap); + if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz) + return false; + + if (bswap && !fndecl && n.range != 16) + return false; + + memset (&nop_stats, 0, sizeof (nop_stats)); + memset (&bswap_stats, 0, sizeof (bswap_stats)); + return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, + bswap_type, load_type, &n, bswap) != NULL_TREE; +} + /* Find manual byte swap implementations as well as load in a given endianness. Byte swaps are turned into a bswap builtin invokation while endian loads are converted to bswap builtin invokation or @@ -5126,6 +5195,7 @@ static enum basic_block_status get_status_for_store_merging (basic_block bb) { unsigned int num_statements = 0; + unsigned int num_constructors = 0; gimple_stmt_iterator gsi; edge e; @@ -5138,9 +5208,27 @@ get_status_for_store_merging (basic_bloc if (store_valid_for_store_merging_p (stmt) && ++num_statements >= 2) break; + + if (is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR) + { + tree rhs = gimple_assign_rhs1 (stmt); + if (VECTOR_TYPE_P (TREE_TYPE (rhs)) + && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) + && gimple_assign_lhs (stmt) != NULL_TREE) + { + HOST_WIDE_INT sz + = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; + if (sz == 16 || sz == 32 || sz == 64) + { + num_constructors = 1; + break; + } + } + } } - if (num_statements == 0) + if (num_statements == 0 && num_constructors == 0) return BB_INVALID; if (cfun->can_throw_non_call_exceptions && cfun->eh @@ -5149,7 +5237,7 @@ get_status_for_store_merging (basic_bloc && e->dest == bb->next_bb) return BB_EXTENDED_VALID; - return num_statements >= 2 ? BB_VALID : BB_INVALID; + return (num_statements >= 2 || num_constructors) ? BB_VALID : BB_INVALID; } /* Entry point for the pass. Go over each basic block recording chains of @@ -5163,6 +5251,7 @@ pass_store_merging::execute (function *f basic_block bb; hash_set<gimple *> orig_stmts; bool changed = false, open_chains = false; + unsigned todo = 0; /* If the function can throw and catch non-call exceptions, we'll be trying to merge stores across different basic blocks so we need to first unsplit @@ -5189,9 +5278,10 @@ pass_store_merging::execute (function *f if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "Processing basic block <%d>:\n", bb->index); - for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); ) { gimple *stmt = gsi_stmt (gsi); + gsi_next (&gsi); if (is_gimple_debug (stmt)) continue; @@ -5207,6 +5297,14 @@ pass_store_merging::execute (function *f continue; } + if (is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR + && maybe_optimize_vector_constructor (stmt)) + { + todo = TODO_update_ssa; + continue; + } + if (store_valid_for_store_merging_p (stmt)) changed |= process_store (stmt); else @@ -5230,10 +5328,10 @@ pass_store_merging::execute (function *f if (cfun->can_throw_non_call_exceptions && cfun->eh && changed) { free_dominance_info (CDI_DOMINATORS); - return TODO_cleanup_cfg; + return TODO_cleanup_cfg | todo; } - return 0; + return todo; } } // anon namespace --- gcc/testsuite/gcc.dg/tree-ssa/pr96239.c.jj 2020-12-16 16:10:30.013256862 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr96239.c 2020-12-16 16:11:08.802822537 +0100 @@ -0,0 +1,18 @@ +/* PR tree-optimization/96239 */ +/* { dg-do compile { target { ilp32 || lp64 } } } */ +/* { dg-options "-O3 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump " r>> 8;" "optimized" { target bswap } } } */ + +union U { unsigned char c[2]; unsigned short s; }; + +unsigned short +foo (unsigned short x) +{ + union U u; + u.s = x; + unsigned char v = u.c[0]; + unsigned char w = u.c[1]; + u.c[0] = w; + u.c[1] = v; + return u.s; +}