Message ID | ZViRqiN7i3waSY2v@tucnak |
---|---|
State | New |
Headers | show |
Series | tree-ssa-math-opts: popcount (X) == 1 to (X ^ (X - 1)) > (X - 1) optimization for direct optab [PR90693] | expand |
On 11/18/23 03:27, Jakub Jelinek wrote: > Hi! > > On Fri, Nov 17, 2023 at 03:01:04PM +0100, Jakub Jelinek wrote: >> As a follow-up, I'm considering changing in this routine the popcount >> call to IFN_POPCOUNT with 2 arguments and during expansion test costs. > > Here is the follow-up which does the rtx costs testing. > While having to tweak internal-fn.def so that POPCOUNT can have a custom > expand_POPCOUNT, I have noticed we are inconsistent, some DEF_INTERNAL* > macros (most of them) were undefined at the end of internal-fn.def (but in > some cases uselessly undefined again after inclusion), while others were not > (and sometimes undefined after the inclusion). I've changed it to always > undefine at the end of internal-fn.def. > > Ok for trunk if it passes bootstrap/regtest? > > 2023-11-18 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/90693 > * tree-ssa-math-opts.cc (match_single_bit_test): Mark POPCOUNT with > result only used in equality comparison against 1 with direct optab > support as .POPCOUNT call with 2 arguments. > * internal-fn.h (expand_POPCOUNT): Declare. > * internal-fn.def: Document missing DEF_INTERNAL* macros and make sure > they are all undefined at the end. > (DEF_INTERNAL_INT_EXT_FN): New macro. > (POPCOUNT): Use it instead of DEF_INTERNAL_INT_FN. > * internal-fn.cc (lookup_hilo_internal_fn, lookup_evenodd_internal_fn, > widening_fn_p, get_len_internal_fn): Don't undef DEF_INTERNAL_*FN > macros after inclusion of internal-fn.def. > (DEF_INTERNAL_INT_EXT_FN): Define to nothing before inclusion to > define expanders. > (expand_POPCOUNT): New function. > > + unsigned cmp_cost = seq_cost (cmp_insns, speed_p); > + if (popcount_cost < cmp_cost) > + emit_insn (popcount_insns); > + else > + { > + emit_insn (cmp_insns); > + plhs = expand_normal (lhs); > + if (GET_MODE (cmp) != GET_MODE (plhs)) > + cmp = convert_to_mode (GET_MODE (plhs), cmp, 1); > + emit_move_insn (plhs, cmp); > + } Did you want <= for the test to use popcount? That seems like a better choice in that scenario to me as the popcount is likely smaller. OK for the trunk as-is or using a <= test. jeff
On Sat, 18 Nov 2023, Jakub Jelinek wrote: > Hi! > > On Fri, Nov 17, 2023 at 03:01:04PM +0100, Jakub Jelinek wrote: > > As a follow-up, I'm considering changing in this routine the popcount > > call to IFN_POPCOUNT with 2 arguments and during expansion test costs. > > Here is the follow-up which does the rtx costs testing. > While having to tweak internal-fn.def so that POPCOUNT can have a custom > expand_POPCOUNT, I have noticed we are inconsistent, some DEF_INTERNAL* > macros (most of them) were undefined at the end of internal-fn.def (but in > some cases uselessly undefined again after inclusion), while others were not > (and sometimes undefined after the inclusion). I've changed it to always > undefine at the end of internal-fn.def. The last bit is cleanup that could go in independently. > Ok for trunk if it passes bootstrap/regtest? > > 2023-11-18 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/90693 > * tree-ssa-math-opts.cc (match_single_bit_test): Mark POPCOUNT with > result only used in equality comparison against 1 with direct optab > support as .POPCOUNT call with 2 arguments. > * internal-fn.h (expand_POPCOUNT): Declare. > * internal-fn.def: Document missing DEF_INTERNAL* macros and make sure > they are all undefined at the end. > (DEF_INTERNAL_INT_EXT_FN): New macro. > (POPCOUNT): Use it instead of DEF_INTERNAL_INT_FN. > * internal-fn.cc (lookup_hilo_internal_fn, lookup_evenodd_internal_fn, > widening_fn_p, get_len_internal_fn): Don't undef DEF_INTERNAL_*FN > macros after inclusion of internal-fn.def. > (DEF_INTERNAL_INT_EXT_FN): Define to nothing before inclusion to > define expanders. > (expand_POPCOUNT): New function. > > --- gcc/tree-ssa-math-opts.cc.jj 2023-11-18 09:38:03.460813910 +0100 > +++ gcc/tree-ssa-math-opts.cc 2023-11-18 10:25:18.751207936 +0100 > @@ -5195,7 +5195,16 @@ match_single_bit_test (gimple_stmt_itera > if (!INTEGRAL_TYPE_P (type)) > return; > if (direct_internal_fn_supported_p (IFN_POPCOUNT, type, OPTIMIZE_FOR_BOTH)) > - return; > + { > + /* Tell expand_POPCOUNT the popcount result is only used in equality > + comparison with one, so that it can decide based on rtx costs. */ > + gimple *g = gimple_build_call_internal (IFN_POPCOUNT, 2, arg, > + integer_one_node); > + gimple_call_set_lhs (g, gimple_call_lhs (call)); > + gimple_stmt_iterator gsi2 = gsi_for_stmt (call); > + gsi_replace (&gsi2, g, true); > + return; > + } > tree argm1 = make_ssa_name (type); > gimple *g = gimple_build_assign (argm1, PLUS_EXPR, arg, > build_int_cst (type, -1)); > --- gcc/internal-fn.h.jj 2023-11-02 12:15:12.223998565 +0100 > +++ gcc/internal-fn.h 2023-11-18 10:14:25.834340505 +0100 > @@ -262,6 +262,7 @@ extern void expand_MULBITINT (internal_f > extern void expand_DIVMODBITINT (internal_fn, gcall *); > extern void expand_FLOATTOBITINT (internal_fn, gcall *); > extern void expand_BITINTTOFLOAT (internal_fn, gcall *); > +extern void expand_POPCOUNT (internal_fn, gcall *); > > extern bool vectorized_internal_fn_supported_p (internal_fn, tree); > > --- gcc/internal-fn.def.jj 2023-11-17 15:51:02.642802521 +0100 > +++ gcc/internal-fn.def 2023-11-18 10:12:10.329236626 +0100 > @@ -33,9 +33,13 @@ along with GCC; see the file COPYING3. > DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SIGNED_OPTAB, > UNSIGNED_OPTAB, TYPE) > DEF_INTERNAL_FLT_FN (NAME, FLAGS, OPTAB, TYPE) > + DEF_INTERNAL_FLT_FLOATN_FN (NAME, FLAGS, OPTAB, TYPE) > DEF_INTERNAL_INT_FN (NAME, FLAGS, OPTAB, TYPE) > + DEF_INTERNAL_INT_EXT_FN (NAME, FLAGS, OPTAB, TYPE) > DEF_INTERNAL_COND_FN (NAME, FLAGS, OPTAB, TYPE) > DEF_INTERNAL_SIGNED_COND_FN (NAME, FLAGS, OPTAB, TYPE) > + DEF_INTERNAL_WIDENING_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, > + TYPE) > > where NAME is the name of the function, FLAGS is a set of > ECF_* flags and FNSPEC is a string describing functions fnspec. > @@ -97,6 +101,10 @@ along with GCC; see the file COPYING3. > says that the function extends the C-level BUILT_IN_<NAME>{,L,LL,IMAX} > group of functions to any integral mode (including vector modes). > > + DEF_INTERNAL_INT_EXT_FN is like DEF_INTERNAL_INT_FN, except that it > + has expand_##NAME defined in internal-fn.cc to override the > + DEF_INTERNAL_INT_FN expansion behavior. > + > DEF_INTERNAL_WIDENING_OPTAB_FN is a wrapper that defines five internal > functions with DEF_INTERNAL_SIGNED_OPTAB_FN: > - one that describes a widening operation with the same number of elements > @@ -160,6 +168,11 @@ along with GCC; see the file COPYING3. > DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE) > #endif > > +#ifndef DEF_INTERNAL_INT_EXT_FN > +#define DEF_INTERNAL_INT_EXT_FN(NAME, FLAGS, OPTAB, TYPE) \ > + DEF_INTERNAL_INT_FN (NAME, FLAGS, OPTAB, TYPE) > +#endif > + > #ifndef DEF_INTERNAL_WIDENING_OPTAB_FN > #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, TYPE) \ > DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, TYPE) \ > @@ -432,7 +445,7 @@ DEF_INTERNAL_INT_FN (CLZ, ECF_CONST | EC > DEF_INTERNAL_INT_FN (CTZ, ECF_CONST | ECF_NOTHROW, ctz, unary) > DEF_INTERNAL_INT_FN (FFS, ECF_CONST | ECF_NOTHROW, ffs, unary) > DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary) > -DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary) > +DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary) > > DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) > DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) > @@ -572,6 +585,10 @@ DEF_INTERNAL_FN (DIVMODBITINT, ECF_LEAF, > DEF_INTERNAL_FN (FLOATTOBITINT, ECF_LEAF | ECF_NOTHROW, ". O . . ") > DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE | ECF_LEAF, ". R . ") > > +#undef DEF_INTERNAL_WIDENING_OPTAB_FN > +#undef DEF_INTERNAL_SIGNED_COND_FN > +#undef DEF_INTERNAL_COND_FN > +#undef DEF_INTERNAL_INT_EXT_FN > #undef DEF_INTERNAL_INT_FN > #undef DEF_INTERNAL_FLT_FN > #undef DEF_INTERNAL_FLT_FLOATN_FN > --- gcc/internal-fn.cc.jj 2023-11-07 08:32:01.741254155 +0100 > +++ gcc/internal-fn.cc 2023-11-18 11:06:31.740703230 +0100 > @@ -102,8 +102,6 @@ lookup_hilo_internal_fn (internal_fn ifn > { > default: > gcc_unreachable (); > -#undef DEF_INTERNAL_FN > -#undef DEF_INTERNAL_WIDENING_OPTAB_FN > #define DEF_INTERNAL_FN(NAME, FLAGS, TYPE) > #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ > case IFN_##NAME: \ > @@ -111,8 +109,6 @@ lookup_hilo_internal_fn (internal_fn ifn > *hi = internal_fn (IFN_##NAME##_HI); \ > break; > #include "internal-fn.def" > -#undef DEF_INTERNAL_FN > -#undef DEF_INTERNAL_WIDENING_OPTAB_FN > } > } > > @@ -129,8 +125,6 @@ lookup_evenodd_internal_fn (internal_fn > { > default: > gcc_unreachable (); > -#undef DEF_INTERNAL_FN > -#undef DEF_INTERNAL_WIDENING_OPTAB_FN > #define DEF_INTERNAL_FN(NAME, FLAGS, TYPE) > #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ > case IFN_##NAME: \ > @@ -138,8 +132,6 @@ lookup_evenodd_internal_fn (internal_fn > *odd = internal_fn (IFN_##NAME##_ODD); \ > break; > #include "internal-fn.def" > -#undef DEF_INTERNAL_FN > -#undef DEF_INTERNAL_WIDENING_OPTAB_FN > } > } > > @@ -4261,7 +4253,6 @@ widening_fn_p (code_helper code) > internal_fn fn = as_internal_fn ((combined_fn) code); > switch (fn) > { > - #undef DEF_INTERNAL_WIDENING_OPTAB_FN > #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ > case IFN_##NAME: \ > case IFN_##NAME##_HI: \ > @@ -4270,7 +4261,6 @@ widening_fn_p (code_helper code) > case IFN_##NAME##_ODD: \ > return true; > #include "internal-fn.def" > - #undef DEF_INTERNAL_WIDENING_OPTAB_FN > > default: > return false; > @@ -4295,6 +4285,7 @@ set_edom_supported_p (void) > { \ > expand_##TYPE##_optab_fn (fn, stmt, OPTAB##_optab); \ > } > +#define DEF_INTERNAL_INT_EXT_FN(CODE, FLAGS, OPTAB, TYPE) > #define DEF_INTERNAL_SIGNED_OPTAB_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \ > UNSIGNED_OPTAB, TYPE) \ > static void \ > @@ -4305,8 +4296,6 @@ set_edom_supported_p (void) > expand_##TYPE##_optab_fn (fn, stmt, which_optab); \ > } > #include "internal-fn.def" > -#undef DEF_INTERNAL_OPTAB_FN > -#undef DEF_INTERNAL_SIGNED_OPTAB_FN > > /* Routines to expand each internal function, indexed by function number. > Each routine has the prototype: > @@ -4465,8 +4454,6 @@ get_len_internal_fn (internal_fn fn) > { > switch (fn) > { > -#undef DEF_INTERNAL_COND_FN > -#undef DEF_INTERNAL_SIGNED_COND_FN > #define DEF_INTERNAL_COND_FN(NAME, ...) \ > case IFN_COND_##NAME: \ > return IFN_COND_LEN_##NAME; > @@ -4474,8 +4461,6 @@ get_len_internal_fn (internal_fn fn) > case IFN_COND_##NAME: \ > return IFN_COND_LEN_##NAME; > #include "internal-fn.def" > -#undef DEF_INTERNAL_COND_FN > -#undef DEF_INTERNAL_SIGNED_COND_FN > default: > return IFN_LAST; > } > @@ -5113,3 +5098,67 @@ expand_BITINTTOFLOAT (internal_fn, gcall > if (val != target) > emit_move_insn (target, val); > } > + > +void > +expand_POPCOUNT (internal_fn fn, gcall *stmt) > +{ > + if (gimple_call_num_args (stmt) == 1) > + { > + expand_unary_optab_fn (fn, stmt, popcount_optab); > + return; > + } > + /* If .POPCOUNT call has 2 arguments, match_single_bit_test marked it > + because the result is only used in an equality comparison against 1. > + Use rtx costs in that case to determine if .POPCOUNT (arg) == 1 > + or (arg ^ (arg - 1)) > arg - 1 is cheaper. */ We're doing plenty of RTX costing during GIMPLE optimizations, I wonder if it could be done equally well there given that seq_cost of an RTX sequence isn't going to capture the compare & branch cost very reliably anyway ... > + bool speed_p = optimize_insn_for_speed_p (); > + tree lhs = gimple_call_lhs (stmt); > + tree arg = gimple_call_arg (stmt, 0); > + tree type = TREE_TYPE (arg); > + machine_mode mode = TYPE_MODE (type); > + do_pending_stack_adjust (); > + start_sequence (); > + expand_unary_optab_fn (fn, stmt, popcount_optab); > + rtx_insn *popcount_insns = get_insns (); > + end_sequence (); > + start_sequence (); > + rtx plhs = expand_normal (lhs); > + rtx pcmp = emit_store_flag (NULL_RTX, EQ, plhs, const1_rtx, mode, 0, 0); > + if (pcmp == NULL_RTX) > + { > + fail: > + end_sequence (); > + emit_insn (popcount_insns); > + return; > + } > + rtx_insn *popcount_cmp_insns = get_insns (); > + end_sequence (); > + start_sequence (); > + rtx op0 = expand_normal (arg); > + rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX, > + 1, OPTAB_DIRECT); > + if (argm1 == NULL_RTX) > + goto fail; > + rtx argxorargm1 = expand_simple_binop (mode, XOR, op0, argm1, NULL_RTX, > + 1, OPTAB_DIRECT); > + if (argxorargm1 == NULL_RTX) > + goto fail; > + rtx cmp = emit_store_flag (NULL_RTX, GTU, argxorargm1, argm1, mode, 1, 1); > + if (cmp == NULL_RTX) > + goto fail; > + rtx_insn *cmp_insns = get_insns (); > + end_sequence (); > + unsigned popcount_cost = (seq_cost (popcount_insns, speed_p) > + + seq_cost (popcount_cmp_insns, speed_p)); > + unsigned cmp_cost = seq_cost (cmp_insns, speed_p); > + if (popcount_cost < cmp_cost) > + emit_insn (popcount_insns); > + else > + { > + emit_insn (cmp_insns); > + plhs = expand_normal (lhs); > + if (GET_MODE (cmp) != GET_MODE (plhs)) > + cmp = convert_to_mode (GET_MODE (plhs), cmp, 1); > + emit_move_insn (plhs, cmp); > + } > +} > > > Jakub > >
On Mon, Nov 20, 2023 at 08:01:45AM +0000, Richard Biener wrote: > > On Fri, Nov 17, 2023 at 03:01:04PM +0100, Jakub Jelinek wrote: > > > As a follow-up, I'm considering changing in this routine the popcount > > > call to IFN_POPCOUNT with 2 arguments and during expansion test costs. > > > > Here is the follow-up which does the rtx costs testing. > > While having to tweak internal-fn.def so that POPCOUNT can have a custom > > expand_POPCOUNT, I have noticed we are inconsistent, some DEF_INTERNAL* > > macros (most of them) were undefined at the end of internal-fn.def (but in > > some cases uselessly undefined again after inclusion), while others were not > > (and sometimes undefined after the inclusion). I've changed it to always > > undefine at the end of internal-fn.def. > > The last bit is cleanup that could go in independently. Ok, I've committed the following patch first and the 2 patches adjusted on top of that. The 2 original ones were bootstrapped/regtested successfully on x86_64-linux and i686-linux last night. 2023-11-20 Jakub Jelinek <jakub@redhat.com> * internal-fn.def: Document missing DEF_INTERNAL* macros and make sure they are all undefined at the end. * internal-fn.cc (lookup_hilo_internal_fn, lookup_evenodd_internal_fn, widening_fn_p, get_len_internal_fn): Don't undef DEF_INTERNAL_*FN macros after inclusion of internal-fn.def. --- gcc/internal-fn.def.jj 2023-11-17 15:51:02.642802521 +0100 +++ gcc/internal-fn.def 2023-11-18 10:12:10.329236626 +0100 @@ -33,9 +33,12 @@ along with GCC; see the file COPYING3. DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SIGNED_OPTAB, UNSIGNED_OPTAB, TYPE) DEF_INTERNAL_FLT_FN (NAME, FLAGS, OPTAB, TYPE) + DEF_INTERNAL_FLT_FLOATN_FN (NAME, FLAGS, OPTAB, TYPE) DEF_INTERNAL_INT_FN (NAME, FLAGS, OPTAB, TYPE) DEF_INTERNAL_COND_FN (NAME, FLAGS, OPTAB, TYPE) DEF_INTERNAL_SIGNED_COND_FN (NAME, FLAGS, OPTAB, TYPE) + DEF_INTERNAL_WIDENING_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, + TYPE) where NAME is the name of the function, FLAGS is a set of ECF_* flags and FNSPEC is a string describing functions fnspec. @@ -572,6 +585,9 @@ DEF_INTERNAL_FN (DIVMODBITINT, ECF_LEAF, DEF_INTERNAL_FN (FLOATTOBITINT, ECF_LEAF | ECF_NOTHROW, ". O . . ") DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE | ECF_LEAF, ". R . ") +#undef DEF_INTERNAL_WIDENING_OPTAB_FN +#undef DEF_INTERNAL_SIGNED_COND_FN +#undef DEF_INTERNAL_COND_FN #undef DEF_INTERNAL_INT_FN #undef DEF_INTERNAL_FLT_FN #undef DEF_INTERNAL_FLT_FLOATN_FN --- gcc/internal-fn.cc.jj 2023-11-07 08:32:01.741254155 +0100 +++ gcc/internal-fn.cc 2023-11-18 11:06:31.740703230 +0100 @@ -102,8 +102,6 @@ lookup_hilo_internal_fn (internal_fn ifn { default: gcc_unreachable (); -#undef DEF_INTERNAL_FN -#undef DEF_INTERNAL_WIDENING_OPTAB_FN #define DEF_INTERNAL_FN(NAME, FLAGS, TYPE) #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ case IFN_##NAME: \ @@ -111,8 +109,6 @@ lookup_hilo_internal_fn (internal_fn ifn *hi = internal_fn (IFN_##NAME##_HI); \ break; #include "internal-fn.def" -#undef DEF_INTERNAL_FN -#undef DEF_INTERNAL_WIDENING_OPTAB_FN } } @@ -129,8 +125,6 @@ lookup_evenodd_internal_fn (internal_fn { default: gcc_unreachable (); -#undef DEF_INTERNAL_FN -#undef DEF_INTERNAL_WIDENING_OPTAB_FN #define DEF_INTERNAL_FN(NAME, FLAGS, TYPE) #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ case IFN_##NAME: \ @@ -138,8 +132,6 @@ lookup_evenodd_internal_fn (internal_fn *odd = internal_fn (IFN_##NAME##_ODD); \ break; #include "internal-fn.def" -#undef DEF_INTERNAL_FN -#undef DEF_INTERNAL_WIDENING_OPTAB_FN } } @@ -4261,7 +4253,6 @@ widening_fn_p (code_helper code) internal_fn fn = as_internal_fn ((combined_fn) code); switch (fn) { - #undef DEF_INTERNAL_WIDENING_OPTAB_FN #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ case IFN_##NAME: \ case IFN_##NAME##_HI: \ @@ -4270,7 +4261,6 @@ widening_fn_p (code_helper code) case IFN_##NAME##_ODD: \ return true; #include "internal-fn.def" - #undef DEF_INTERNAL_WIDENING_OPTAB_FN default: return false; @@ -4305,8 +4296,6 @@ set_edom_supported_p (void) expand_##TYPE##_optab_fn (fn, stmt, which_optab); \ } #include "internal-fn.def" -#undef DEF_INTERNAL_OPTAB_FN -#undef DEF_INTERNAL_SIGNED_OPTAB_FN /* Routines to expand each internal function, indexed by function number. Each routine has the prototype: @@ -4465,8 +4454,6 @@ get_len_internal_fn (internal_fn fn) { switch (fn) { -#undef DEF_INTERNAL_COND_FN -#undef DEF_INTERNAL_SIGNED_COND_FN #define DEF_INTERNAL_COND_FN(NAME, ...) \ case IFN_COND_##NAME: \ return IFN_COND_LEN_##NAME; @@ -4474,8 +4461,6 @@ get_len_internal_fn (internal_fn fn) case IFN_COND_##NAME: \ return IFN_COND_LEN_##NAME; #include "internal-fn.def" -#undef DEF_INTERNAL_COND_FN -#undef DEF_INTERNAL_SIGNED_COND_FN default: return IFN_LAST; } Jakub
--- gcc/tree-ssa-math-opts.cc.jj 2023-11-18 09:38:03.460813910 +0100 +++ gcc/tree-ssa-math-opts.cc 2023-11-18 10:25:18.751207936 +0100 @@ -5195,7 +5195,16 @@ match_single_bit_test (gimple_stmt_itera if (!INTEGRAL_TYPE_P (type)) return; if (direct_internal_fn_supported_p (IFN_POPCOUNT, type, OPTIMIZE_FOR_BOTH)) - return; + { + /* Tell expand_POPCOUNT the popcount result is only used in equality + comparison with one, so that it can decide based on rtx costs. */ + gimple *g = gimple_build_call_internal (IFN_POPCOUNT, 2, arg, + integer_one_node); + gimple_call_set_lhs (g, gimple_call_lhs (call)); + gimple_stmt_iterator gsi2 = gsi_for_stmt (call); + gsi_replace (&gsi2, g, true); + return; + } tree argm1 = make_ssa_name (type); gimple *g = gimple_build_assign (argm1, PLUS_EXPR, arg, build_int_cst (type, -1)); --- gcc/internal-fn.h.jj 2023-11-02 12:15:12.223998565 +0100 +++ gcc/internal-fn.h 2023-11-18 10:14:25.834340505 +0100 @@ -262,6 +262,7 @@ extern void expand_MULBITINT (internal_f extern void expand_DIVMODBITINT (internal_fn, gcall *); extern void expand_FLOATTOBITINT (internal_fn, gcall *); extern void expand_BITINTTOFLOAT (internal_fn, gcall *); +extern void expand_POPCOUNT (internal_fn, gcall *); extern bool vectorized_internal_fn_supported_p (internal_fn, tree); --- gcc/internal-fn.def.jj 2023-11-17 15:51:02.642802521 +0100 +++ gcc/internal-fn.def 2023-11-18 10:12:10.329236626 +0100 @@ -33,9 +33,13 @@ along with GCC; see the file COPYING3. DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SIGNED_OPTAB, UNSIGNED_OPTAB, TYPE) DEF_INTERNAL_FLT_FN (NAME, FLAGS, OPTAB, TYPE) + DEF_INTERNAL_FLT_FLOATN_FN (NAME, FLAGS, OPTAB, TYPE) DEF_INTERNAL_INT_FN (NAME, FLAGS, OPTAB, TYPE) + DEF_INTERNAL_INT_EXT_FN (NAME, FLAGS, OPTAB, TYPE) DEF_INTERNAL_COND_FN (NAME, FLAGS, OPTAB, TYPE) DEF_INTERNAL_SIGNED_COND_FN (NAME, FLAGS, OPTAB, TYPE) + DEF_INTERNAL_WIDENING_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, + TYPE) where NAME is the name of the function, FLAGS is a set of ECF_* flags and FNSPEC is a string describing functions fnspec. @@ -97,6 +101,10 @@ along with GCC; see the file COPYING3. says that the function extends the C-level BUILT_IN_<NAME>{,L,LL,IMAX} group of functions to any integral mode (including vector modes). + DEF_INTERNAL_INT_EXT_FN is like DEF_INTERNAL_INT_FN, except that it + has expand_##NAME defined in internal-fn.cc to override the + DEF_INTERNAL_INT_FN expansion behavior. + DEF_INTERNAL_WIDENING_OPTAB_FN is a wrapper that defines five internal functions with DEF_INTERNAL_SIGNED_OPTAB_FN: - one that describes a widening operation with the same number of elements @@ -160,6 +168,11 @@ along with GCC; see the file COPYING3. DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE) #endif +#ifndef DEF_INTERNAL_INT_EXT_FN +#define DEF_INTERNAL_INT_EXT_FN(NAME, FLAGS, OPTAB, TYPE) \ + DEF_INTERNAL_INT_FN (NAME, FLAGS, OPTAB, TYPE) +#endif + #ifndef DEF_INTERNAL_WIDENING_OPTAB_FN #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, TYPE) \ DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, TYPE) \ @@ -432,7 +445,7 @@ DEF_INTERNAL_INT_FN (CLZ, ECF_CONST | EC DEF_INTERNAL_INT_FN (CTZ, ECF_CONST | ECF_NOTHROW, ctz, unary) DEF_INTERNAL_INT_FN (FFS, ECF_CONST | ECF_NOTHROW, ffs, unary) DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary) -DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary) +DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary) DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) @@ -572,6 +585,10 @@ DEF_INTERNAL_FN (DIVMODBITINT, ECF_LEAF, DEF_INTERNAL_FN (FLOATTOBITINT, ECF_LEAF | ECF_NOTHROW, ". O . . ") DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE | ECF_LEAF, ". R . ") +#undef DEF_INTERNAL_WIDENING_OPTAB_FN +#undef DEF_INTERNAL_SIGNED_COND_FN +#undef DEF_INTERNAL_COND_FN +#undef DEF_INTERNAL_INT_EXT_FN #undef DEF_INTERNAL_INT_FN #undef DEF_INTERNAL_FLT_FN #undef DEF_INTERNAL_FLT_FLOATN_FN --- gcc/internal-fn.cc.jj 2023-11-07 08:32:01.741254155 +0100 +++ gcc/internal-fn.cc 2023-11-18 11:06:31.740703230 +0100 @@ -102,8 +102,6 @@ lookup_hilo_internal_fn (internal_fn ifn { default: gcc_unreachable (); -#undef DEF_INTERNAL_FN -#undef DEF_INTERNAL_WIDENING_OPTAB_FN #define DEF_INTERNAL_FN(NAME, FLAGS, TYPE) #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ case IFN_##NAME: \ @@ -111,8 +109,6 @@ lookup_hilo_internal_fn (internal_fn ifn *hi = internal_fn (IFN_##NAME##_HI); \ break; #include "internal-fn.def" -#undef DEF_INTERNAL_FN -#undef DEF_INTERNAL_WIDENING_OPTAB_FN } } @@ -129,8 +125,6 @@ lookup_evenodd_internal_fn (internal_fn { default: gcc_unreachable (); -#undef DEF_INTERNAL_FN -#undef DEF_INTERNAL_WIDENING_OPTAB_FN #define DEF_INTERNAL_FN(NAME, FLAGS, TYPE) #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ case IFN_##NAME: \ @@ -138,8 +132,6 @@ lookup_evenodd_internal_fn (internal_fn *odd = internal_fn (IFN_##NAME##_ODD); \ break; #include "internal-fn.def" -#undef DEF_INTERNAL_FN -#undef DEF_INTERNAL_WIDENING_OPTAB_FN } } @@ -4261,7 +4253,6 @@ widening_fn_p (code_helper code) internal_fn fn = as_internal_fn ((combined_fn) code); switch (fn) { - #undef DEF_INTERNAL_WIDENING_OPTAB_FN #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ case IFN_##NAME: \ case IFN_##NAME##_HI: \ @@ -4270,7 +4261,6 @@ widening_fn_p (code_helper code) case IFN_##NAME##_ODD: \ return true; #include "internal-fn.def" - #undef DEF_INTERNAL_WIDENING_OPTAB_FN default: return false; @@ -4295,6 +4285,7 @@ set_edom_supported_p (void) { \ expand_##TYPE##_optab_fn (fn, stmt, OPTAB##_optab); \ } +#define DEF_INTERNAL_INT_EXT_FN(CODE, FLAGS, OPTAB, TYPE) #define DEF_INTERNAL_SIGNED_OPTAB_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \ UNSIGNED_OPTAB, TYPE) \ static void \ @@ -4305,8 +4296,6 @@ set_edom_supported_p (void) expand_##TYPE##_optab_fn (fn, stmt, which_optab); \ } #include "internal-fn.def" -#undef DEF_INTERNAL_OPTAB_FN -#undef DEF_INTERNAL_SIGNED_OPTAB_FN /* Routines to expand each internal function, indexed by function number. Each routine has the prototype: @@ -4465,8 +4454,6 @@ get_len_internal_fn (internal_fn fn) { switch (fn) { -#undef DEF_INTERNAL_COND_FN -#undef DEF_INTERNAL_SIGNED_COND_FN #define DEF_INTERNAL_COND_FN(NAME, ...) \ case IFN_COND_##NAME: \ return IFN_COND_LEN_##NAME; @@ -4474,8 +4461,6 @@ get_len_internal_fn (internal_fn fn) case IFN_COND_##NAME: \ return IFN_COND_LEN_##NAME; #include "internal-fn.def" -#undef DEF_INTERNAL_COND_FN -#undef DEF_INTERNAL_SIGNED_COND_FN default: return IFN_LAST; } @@ -5113,3 +5098,67 @@ expand_BITINTTOFLOAT (internal_fn, gcall if (val != target) emit_move_insn (target, val); } + +void +expand_POPCOUNT (internal_fn fn, gcall *stmt) +{ + if (gimple_call_num_args (stmt) == 1) + { + expand_unary_optab_fn (fn, stmt, popcount_optab); + return; + } + /* If .POPCOUNT call has 2 arguments, match_single_bit_test marked it + because the result is only used in an equality comparison against 1. + Use rtx costs in that case to determine if .POPCOUNT (arg) == 1 + or (arg ^ (arg - 1)) > arg - 1 is cheaper. */ + bool speed_p = optimize_insn_for_speed_p (); + tree lhs = gimple_call_lhs (stmt); + tree arg = gimple_call_arg (stmt, 0); + tree type = TREE_TYPE (arg); + machine_mode mode = TYPE_MODE (type); + do_pending_stack_adjust (); + start_sequence (); + expand_unary_optab_fn (fn, stmt, popcount_optab); + rtx_insn *popcount_insns = get_insns (); + end_sequence (); + start_sequence (); + rtx plhs = expand_normal (lhs); + rtx pcmp = emit_store_flag (NULL_RTX, EQ, plhs, const1_rtx, mode, 0, 0); + if (pcmp == NULL_RTX) + { + fail: + end_sequence (); + emit_insn (popcount_insns); + return; + } + rtx_insn *popcount_cmp_insns = get_insns (); + end_sequence (); + start_sequence (); + rtx op0 = expand_normal (arg); + rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX, + 1, OPTAB_DIRECT); + if (argm1 == NULL_RTX) + goto fail; + rtx argxorargm1 = expand_simple_binop (mode, XOR, op0, argm1, NULL_RTX, + 1, OPTAB_DIRECT); + if (argxorargm1 == NULL_RTX) + goto fail; + rtx cmp = emit_store_flag (NULL_RTX, GTU, argxorargm1, argm1, mode, 1, 1); + if (cmp == NULL_RTX) + goto fail; + rtx_insn *cmp_insns = get_insns (); + end_sequence (); + unsigned popcount_cost = (seq_cost (popcount_insns, speed_p) + + seq_cost (popcount_cmp_insns, speed_p)); + unsigned cmp_cost = seq_cost (cmp_insns, speed_p); + if (popcount_cost < cmp_cost) + emit_insn (popcount_insns); + else + { + emit_insn (cmp_insns); + plhs = expand_normal (lhs); + if (GET_MODE (cmp) != GET_MODE (plhs)) + cmp = convert_to_mode (GET_MODE (plhs), cmp, 1); + emit_move_insn (plhs, cmp); + } +}