Message ID | alpine.DEB.2.02.1208111417100.26487@stedding.saclay.inria.fr |
---|---|
State | New |
Headers | show |
On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse <marc.glisse@inria.fr> wrote: > On Fri, 10 Aug 2012, Marc Glisse wrote: > >> this patch detects permutations of permutations and merges them. It also >> canonicalizes permutations a bit more. >> >> There are several issues with this patch: >> >> 1) I am not sure we always want to combine permutations. Indeed, someone >> (user? vectorizer?) may have written 2 permutations to help the backend >> generate optimal code, whereas it will do a bad job on the complicated >> combined permutation. At tree level, I am not sure what can be done except >> possibly restrict the optimization to the case where the combined >> permutation is the identity. At the RTL level, we could compare what insns >> are generated, but that's going to be painful (doing anything with >> permutations at the RTL level is painful). >> >> 2) I don't understand how the cleanup works in tree-ssa-forwprop.c. I >> copied some fold/update/remove lines from other simplifications, but I don't >> know if they are right. >> >> 3) In a first version of the patch, where I had SSA_NAME_DEF_STMT instead >> of get_prop_source_stmt, I got an ICE in one of the torture vectorization >> testcases. It happened in expand_expr_real_2, because it asserts that >> expand_vec_perm will never fail. However, expand_vec_perm does return >> NULL_RTX sometimes. Here it was in V16QImode, with the permutation >> {0,0,2,2,4,...,14,14}. maybe_expand_insn can't handle it directly (that's ok >> I guess), but then expand_vec_perm sees VOIDmode and exits instead of trying >> other ways. I don't know if there is a latent bug or if (more likely) my >> patch may produce trees with wrong modes. >> >> 4) Is this the right place? >> >> This isn't the transformation I am most interested in, but it is a first >> step to see if the direction is right. 
> > > Hello, > > there was yet another issue with the version I posted: the testcase was > trivial so I didn't notice that it didn't perform the optimization at all... > > Here is a new version. It seems a bit strange to me that there are plenty of > functions that check for single-use variables, but there isn't any that > checks for variables used in a single statement (but possibly twice). So I > may be doing something strange, but it seems to be the natural test here. > > Happily passed bootstrap+regtest. > > 2012-08-11 Marc Glisse <marc.glisse@inria.fr> > > > gcc/ > * fold-const.c (fold_ternary_loc): Detect identity permutations. > Canonicalize permutations more. > * tree-ssa-forwprop.c (is_only_used_in): New function. > (simplify_permutation): Likewise. > > (ssa_forward_propagate_and_combine): Call it. > > gcc/testsuite/ > * gcc.dg/tree-ssa/forwprop-19.c: New testcase. > > -- > Marc Glisse > > Index: gcc/tree-ssa-forwprop.c > =================================================================== > --- gcc/tree-ssa-forwprop.c (revision 190311) > +++ gcc/tree-ssa-forwprop.c (working copy) > @@ -2569,20 +2569,97 @@ combine_conversions (gimple_stmt_iterato > gimple_assign_set_rhs_code (stmt, CONVERT_EXPR); > update_stmt (stmt); > return remove_prop_source_from_use (op0) ? 2 : 1; > } > } > } > > return 0; > } > > +/* Return true if VAR has no nondebug use but in stmt. */ > +static bool > +is_only_used_in (const_tree var, const_gimple stmt) > +{ > + const ssa_use_operand_t *const ptr0 = &(SSA_NAME_IMM_USE_NODE (var)); > + const ssa_use_operand_t *ptr = ptr0->next; > + > + for (; ptr != ptr0; ptr = ptr->next) > + { > + const_gimple use_stmt = USE_STMT (ptr); > + if (is_gimple_debug (use_stmt)) > + continue; > + if (use_stmt != stmt) > + return false; > + } > + return true; > +} if (!single_imm_use (var, &use, &use_stmt) || use_stmt != stmt) instead of the above. > +/* Combine two shuffles in a row. 
Returns 1 if there were any changes > + made, 2 if cfg-cleanup needs to run. Else it returns 0. */ > + > +static int > +simplify_permutation (gimple_stmt_iterator *gsi) > +{ > + gimple stmt = gsi_stmt (*gsi); > + gimple def_stmt; > + tree op0, op1, op2, op3, mask; > + enum tree_code code = gimple_assign_rhs_code (stmt); > + enum tree_code code2; > + location_t loc = gimple_location (stmt); > + > + gcc_checking_assert (code == VEC_PERM_EXPR); > + > + op0 = gimple_assign_rhs1 (stmt); > + op1 = gimple_assign_rhs2 (stmt); > + op2 = gimple_assign_rhs3 (stmt); > + > + if (TREE_CODE (op0) != SSA_NAME) > + return 0; > + > + if (TREE_CODE (op2) != VECTOR_CST) > + return 0; > + > + def_stmt = SSA_NAME_DEF_STMT (op0); > + if (!def_stmt || !is_gimple_assign (def_stmt) > + || !can_propagate_from (def_stmt) > + || !is_only_used_in (op0, stmt)) Or rather than the above, simply check || !has_single_use (op0) here. > + return 0; > + > + /* Check that it is only used here. We cannot use has_single_use > + since the expression is using it twice itself... */ Ah ... so then || num_imm_uses (op0) != 2 > + code2 = gimple_assign_rhs_code (def_stmt); > + > + /* Two consecutive shuffles. */ > + if (code2 == VEC_PERM_EXPR) > + { > + op3 = gimple_assign_rhs3 (def_stmt); > + if (TREE_CODE (op3) != VECTOR_CST) > + return 0; > + mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3), > + op3, op3, op2); > + gcc_assert (TREE_CODE (mask) == VECTOR_CST); which means you can use mask = fold_ternary (loc, ...); Ok with that changes. Thanks, Richard. > + gimple_assign_set_rhs1 (stmt, gimple_assign_rhs1 (def_stmt)); > + gimple_assign_set_rhs2 (stmt, gimple_assign_rhs2 (def_stmt)); > + gimple_assign_set_rhs3 (stmt, mask); > + fold_stmt_inplace (gsi); > + update_stmt (stmt); > + return remove_prop_source_from_use (op0) ? 2 : 1; > + } > + > + return false; > +} > + > /* Main entry point for the forward propagation and statement combine > optimizer. 
*/ > > static unsigned int > ssa_forward_propagate_and_combine (void) > { > basic_block bb; > unsigned int todoflags = 0; > > cfg_changed = false; > @@ -2731,20 +2808,27 @@ ssa_forward_propagate_and_combine (void) > changed = associate_pointerplus (&gsi); > else if (CONVERT_EXPR_CODE_P (code) > || code == FLOAT_EXPR > || code == FIX_TRUNC_EXPR) > { > int did_something = combine_conversions (&gsi); > if (did_something == 2) > cfg_changed = true; > changed = did_something != 0; > } > + else if (code == VEC_PERM_EXPR) > + { > + int did_something = simplify_permutation (&gsi); > + if (did_something == 2) > + cfg_changed = true; > + changed = did_something != 0; > + } > break; > } > > case GIMPLE_SWITCH: > changed = simplify_gimple_switch (stmt); > break; > > case GIMPLE_COND: > { > int did_something; > Index: gcc/fold-const.c > =================================================================== > --- gcc/fold-const.c (revision 190311) > +++ gcc/fold-const.c (working copy) > @@ -14148,58 +14148,96 @@ fold_ternary_loc (location_t loc, enum t > return fold_build2_loc (loc, PLUS_EXPR, type, > const_binop (MULT_EXPR, arg0, arg1), arg2); > if (integer_zerop (arg2)) > return fold_build2_loc (loc, MULT_EXPR, type, arg0, arg1); > > return fold_fma (loc, type, arg0, arg1, arg2); > > case VEC_PERM_EXPR: > if (TREE_CODE (arg2) == VECTOR_CST) > { > - unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i; > + unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i, mask; > unsigned char *sel = XALLOCAVEC (unsigned char, nelts); > tree t; > bool need_mask_canon = false; > + bool all_in_vec0 = true; > + bool all_in_vec1 = true; > + bool maybe_identity = true; > + bool single_arg = (op0 == op1); > + bool changed = false; > > + mask = single_arg ? 
(nelts - 1) : (2 * nelts - 1); > gcc_assert (nelts == VECTOR_CST_NELTS (arg2)); > for (i = 0; i < nelts; i++) > { > tree val = VECTOR_CST_ELT (arg2, i); > if (TREE_CODE (val) != INTEGER_CST) > return NULL_TREE; > > - sel[i] = TREE_INT_CST_LOW (val) & (2 * nelts - 1); > + sel[i] = TREE_INT_CST_LOW (val) & mask; > if (TREE_INT_CST_HIGH (val) > || ((unsigned HOST_WIDE_INT) > TREE_INT_CST_LOW (val) != sel[i])) > need_mask_canon = true; > + > + if (sel[i] < nelts) > + all_in_vec1 = false; > + else > + all_in_vec0 = false; > + > + if ((sel[i] & (nelts-1)) != i) > + maybe_identity = false; > + } > + > + if (maybe_identity) > + { > + if (all_in_vec0) > + return op0; > + if (all_in_vec1) > + return op1; > } > > if ((TREE_CODE (arg0) == VECTOR_CST > || TREE_CODE (arg0) == CONSTRUCTOR) > && (TREE_CODE (arg1) == VECTOR_CST > || TREE_CODE (arg1) == CONSTRUCTOR)) > { > t = fold_vec_perm (type, arg0, arg1, sel); > if (t != NULL_TREE) > return t; > } > > + if (all_in_vec0) > + op1 = op0; > + else if (all_in_vec1) > + { > + op0 = op1; > + for (i = 0; i < nelts; i++) > + sel[i] -= nelts; > + need_mask_canon = true; > + } > + > + if (op0 == op1 && !single_arg) > + changed = true; > + > if (need_mask_canon && arg2 == op2) > { > tree *tsel = XALLOCAVEC (tree, nelts); > tree eltype = TREE_TYPE (TREE_TYPE (arg2)); > for (i = 0; i < nelts; i++) > tsel[i] = build_int_cst (eltype, sel[i]); > - t = build_vector (TREE_TYPE (arg2), tsel); > - return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, t); > + op2 = build_vector (TREE_TYPE (arg2), tsel); > + changed = true; > } > + > + if (changed) > + return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, op2); > } > return NULL_TREE; > > default: > return NULL_TREE; > } /* switch (code) */ > } > > /* Perform constant folding and related simplification of EXPR. 
> The related simplifications include x*1 => x, x*0 => 0, etc., > Index: gcc/testsuite/gcc.dg/tree-ssa/forwprop-19.c > =================================================================== > --- gcc/testsuite/gcc.dg/tree-ssa/forwprop-19.c (revision 0) > +++ gcc/testsuite/gcc.dg/tree-ssa/forwprop-19.c (revision 0) > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O -fdump-tree-forwprop2" } */ > + > +typedef int vec __attribute__((vector_size (2 * sizeof (int)))); > +void f (vec *x1, vec *x2) > +{ > + vec m = { 3, 1 }; > + vec n = { 1, 8 }; > + vec y = __builtin_shuffle (*x1, *x2, n); > + vec z = __builtin_shuffle (y, m); > + *x1 = z; > +} > + > +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 0, 0 }" "forwprop2" } } */ > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "forwprop2" } } */ > +/* { dg-final { scan-tree-dump-times "x2" 1 "forwprop2" } } */ > +/* { dg-final { cleanup-tree-dump "forwprop2" } } */ > > Property changes on: gcc/testsuite/gcc.dg/tree-ssa/forwprop-19.c > ___________________________________________________________________ > Added: svn:eol-style > + native > Added: svn:keywords > + Author Date Id Revision URL > >
On Mon, 13 Aug 2012, Richard Guenther wrote: > On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse <marc.glisse@inria.fr> wrote: >> On Fri, 10 Aug 2012, Marc Glisse wrote: >> >>> 1) I am not sure we always want to combine permutations. Indeed, someone >>> (user? vectorizer?) may have written 2 permutations to help the backend >>> generate optimal code, whereas it will do a bad job on the complicated >>> combined permutation. At tree level, I am not sure what can be done except >>> possibly restrict the optimization to the case where the combined >>> permutation is the identity. At the RTL level, we could compare what insns >>> are generated, but that's going to be painful (doing anything with >>> permutations at the RTL level is painful). I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. >> +/* Return true if VAR has no nondebug use but in stmt. */ >> +static bool >> +is_only_used_in (const_tree var, const_gimple stmt) >> +{ >> + const ssa_use_operand_t *const ptr0 = &(SSA_NAME_IMM_USE_NODE (var)); >> + const ssa_use_operand_t *ptr = ptr0->next; >> + >> + for (; ptr != ptr0; ptr = ptr->next) >> + { >> + const_gimple use_stmt = USE_STMT (ptr); >> + if (is_gimple_debug (use_stmt)) >> + continue; >> + if (use_stmt != stmt) >> + return false; >> + } >> + return true; >> +} > > if (!single_imm_use (var, &use, &use_stmt) || use_stmt != stmt) > > instead of the above. I don't think that works with statements that use a variable twice. >> +/* Combine two shuffles in a row. Returns 1 if there were any changes >> + made, 2 if cfg-cleanup needs to run. Else it returns 0. 
*/ >> + >> +static int >> +simplify_permutation (gimple_stmt_iterator *gsi) >> +{ >> + gimple stmt = gsi_stmt (*gsi); >> + gimple def_stmt; >> + tree op0, op1, op2, op3, mask; >> + enum tree_code code = gimple_assign_rhs_code (stmt); >> + enum tree_code code2; >> + location_t loc = gimple_location (stmt); >> + >> + gcc_checking_assert (code == VEC_PERM_EXPR); >> + >> + op0 = gimple_assign_rhs1 (stmt); >> + op1 = gimple_assign_rhs2 (stmt); >> + op2 = gimple_assign_rhs3 (stmt); >> + >> + if (TREE_CODE (op0) != SSA_NAME) >> + return 0; >> + >> + if (TREE_CODE (op2) != VECTOR_CST) >> + return 0; >> + >> + def_stmt = SSA_NAME_DEF_STMT (op0); >> + if (!def_stmt || !is_gimple_assign (def_stmt) >> + || !can_propagate_from (def_stmt) >> + || !is_only_used_in (op0, stmt)) > > Or rather than the above, simply check > > || !has_single_use (op0) > > here. Then there was my previous (non-working) patch that used get_prop_source_stmt. >> + return 0; >> + >> + /* Check that it is only used here. We cannot use has_single_use >> + since the expression is using it twice itself... */ > > Ah ... so then > > || num_imm_uses (op0) != 2 Ah, ok, that's simpler indeed, but there were such dire warnings to never use that evil function unless absolutely necessary that I didn't dare use it... Thanks for the permission. >> + code2 = gimple_assign_rhs_code (def_stmt); >> + >> + /* Two consecutive shuffles. */ >> + if (code2 == VEC_PERM_EXPR) >> + { >> + op3 = gimple_assign_rhs3 (def_stmt); >> + if (TREE_CODE (op3) != VECTOR_CST) >> + return 0; >> + mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3), >> + op3, op3, op2); >> + gcc_assert (TREE_CODE (mask) == VECTOR_CST); > > which means you can use > > mask = fold_ternary (loc, ...); > > Ok with that changes. Thanks a lot, I'll do that. Next step should be either BIT_FIELD_REF(VEC_PERM_EXPR) or VEC_PERM_EXPR(CONSTRUCTOR). Is there a good way to determine whether this kind of combination should be done forwards or backwards (i.e. 
start from VEC_PERM_EXPR and look if its single use is in BIT_FIELD_REF, or start from BIT_FIELD_REF and look if its argument is a VEC_PERM_EXPR only used here?
On Mon, Aug 13, 2012 at 3:03 PM, Marc Glisse <marc.glisse@inria.fr> wrote: > On Mon, 13 Aug 2012, Richard Guenther wrote: > >> On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse <marc.glisse@inria.fr> wrote: >>> >>> On Fri, 10 Aug 2012, Marc Glisse wrote: >>> >>>> 1) I am not sure we always want to combine permutations. Indeed, someone >>>> (user? vectorizer?) may have written 2 permutations to help the backend >>>> generate optimal code, whereas it will do a bad job on the complicated >>>> combined permutation. At tree level, I am not sure what can be done >>>> except >>>> possibly restrict the optimization to the case where the combined >>>> permutation is the identity. At the RTL level, we could compare what >>>> insns >>>> are generated, but that's going to be painful (doing anything with >>>> permutations at the RTL level is painful). > > > I guess people will complain soon enough if this causes horrible performance > regressions in vectorized code. > > >>> +/* Return true if VAR has no nondebug use but in stmt. */ >>> +static bool >>> +is_only_used_in (const_tree var, const_gimple stmt) >>> +{ >>> + const ssa_use_operand_t *const ptr0 = &(SSA_NAME_IMM_USE_NODE (var)); >>> + const ssa_use_operand_t *ptr = ptr0->next; >>> + >>> + for (; ptr != ptr0; ptr = ptr->next) >>> + { >>> + const_gimple use_stmt = USE_STMT (ptr); >>> + if (is_gimple_debug (use_stmt)) >>> + continue; >>> + if (use_stmt != stmt) >>> + return false; >>> + } >>> + return true; >>> +} >> >> >> if (!single_imm_use (var, &use, &use_stmt) || use_stmt != stmt) >> >> instead of the above. > > > I don't think that works with statements that use a variable twice. > > >>> +/* Combine two shuffles in a row. Returns 1 if there were any changes >>> + made, 2 if cfg-cleanup needs to run. Else it returns 0. 
*/ >>> + >>> +static int >>> +simplify_permutation (gimple_stmt_iterator *gsi) >>> +{ >>> + gimple stmt = gsi_stmt (*gsi); >>> + gimple def_stmt; >>> + tree op0, op1, op2, op3, mask; >>> + enum tree_code code = gimple_assign_rhs_code (stmt); >>> + enum tree_code code2; >>> + location_t loc = gimple_location (stmt); >>> + >>> + gcc_checking_assert (code == VEC_PERM_EXPR); >>> + >>> + op0 = gimple_assign_rhs1 (stmt); >>> + op1 = gimple_assign_rhs2 (stmt); >>> + op2 = gimple_assign_rhs3 (stmt); >>> + >>> + if (TREE_CODE (op0) != SSA_NAME) >>> + return 0; >>> + >>> + if (TREE_CODE (op2) != VECTOR_CST) >>> + return 0; >>> + >>> + def_stmt = SSA_NAME_DEF_STMT (op0); >>> + if (!def_stmt || !is_gimple_assign (def_stmt) >>> + || !can_propagate_from (def_stmt) >>> + || !is_only_used_in (op0, stmt)) >> >> >> Or rather than the above, simply check >> >> || !has_single_use (op0) >> >> here. > > > Then there was my previous (non-working) patch that used > get_prop_source_stmt. > > >>> + return 0; >>> + >>> + /* Check that it is only used here. We cannot use has_single_use >>> + since the expression is using it twice itself... */ >> >> >> Ah ... so then >> >> || num_imm_uses (op0) != 2 > > > Ah, ok, that's simpler indeed, but there were such dire warnings to never > use that evil function unless absolutely necessary that I didn't dare use > it... Thanks for the permission. If your new predicate would match more places (can you do a quick search?) then we want it in tree-flow-inline.h instead of in tree-ssa-forwprop.c. But yes, num_imm_uses can be expensive. For now just stick with the above. >>> + code2 = gimple_assign_rhs_code (def_stmt); >>> + >>> + /* Two consecutive shuffles. 
*/ >>> + if (code2 == VEC_PERM_EXPR) >>> + { >>> + op3 = gimple_assign_rhs3 (def_stmt); >>> + if (TREE_CODE (op3) != VECTOR_CST) >>> + return 0; >>> + mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3), >>> + op3, op3, op2); >>> + gcc_assert (TREE_CODE (mask) == VECTOR_CST); >> >> >> which means you can use >> >> mask = fold_ternary (loc, ...); >> >> Ok with that changes. > > > Thanks a lot, I'll do that. > > Next step should be either BIT_FIELD_REF(VEC_PERM_EXPR) or > VEC_PERM_EXPR(CONSTRUCTOR). Is there a good way to determine whether this > kind of combination should be done forwards or backwards (i.e. start from > VEC_PERM_EXPR and look if its single use is in BIT_FIELD_REF, or start from > BIT_FIELD_REF and look if its argument is a VEC_PERM_EXPR only used here? The natural SSA (and forwprop) way is to look for BIT_FIELD_REF and see if the def stmt is a VEC_PERM_EXPR. Richard. > -- > Marc Glisse
> > I guess people will complain soon enough if this causes horrible performance > regressions in vectorized code. Not having looked at your patch in great detail, surely what we don't want is a situation where 2 constant permutations are converted into one generic permute. Based on a quick read of your patch I couldn't work that out. It might be that 2 constant permutes are cheaper than a generic permute. Have you looked at any examples in that space. I surely wouldn't like to see a sequence of interleave / transpose change into a generic permute operation on Neon as that would be far more expensive than this. It surely needs more testing than just this bit before going in. The reason being that this would likely take more registers and indeed produce loads of a constant pool for the new mask. regards, Ramana
On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> wrote: >> >> I guess people will complain soon enough if this causes horrible performance >> regressions in vectorized code. > > Not having looked at your patch in great detail,. surely what we don't > want is a situation where 2 constant permutations are converted into > one generic permute. Based on a quick read of your patch I couldn't > work that out. It might be that 2 constant permutes are cheaper than > a generic permute. Have you looked at any examples in that space . I > surely wouldn't like to see a sequence of interleave / transpose > change into a generic permute operation on Neon as that would be far > more expensive than this. It surely needs more testting than just > this bit before going in. The reason being that this would likely take > more registers and indeed produce loads of a constant pool for the new > mask. The patch does not do that. It merely assumes that the target knows how to perform an optimal constant permute and that two constant permutes never generate better code than a single one. Richard. > regards, > Ramana
On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: > On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan > <ramana.radhakrishnan@linaro.org> wrote: > >> > >> I guess people will complain soon enough if this causes horrible performance > >> regressions in vectorized code. > > > > Not having looked at your patch in great detail,. surely what we don't > > want is a situation where 2 constant permutations are converted into > > one generic permute. Based on a quick read of your patch I couldn't > > work that out. It might be that 2 constant permutes are cheaper than > > a generic permute. Have you looked at any examples in that space . I > > surely wouldn't like to see a sequence of interleave / transpose > > change into a generic permute operation on Neon as that would be far > > more expensive than this. It surely needs more testting than just > > this bit before going in. The reason being that this would likely take > > more registers and indeed produce loads of a constant pool for the new > > mask. > > The patch does not do that. It merely assumes that the target knows > how to perform an optimal constant permute and that two constant > permutes never generate better code than a single one. Still, the patch should do some tests whether it is beneficial. At least a can_vec_perm_p (mode, false, sel) test of the resulting permutation if both the original permutations pass that test, and perhaps additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and passes for both the original permutations then it should also pass for the resulting permutation. Jakub
On Mon, 13 Aug 2012, Richard Guenther wrote: > On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan > <ramana.radhakrishnan@linaro.org> wrote: >>> >>> I guess people will complain soon enough if this causes horrible performance >>> regressions in vectorized code. >> >> Not having looked at your patch in great detail,. surely what we don't >> want is a situation where 2 constant permutations are converted into >> one generic permute. Based on a quick read of your patch I couldn't >> work that out. It might be that 2 constant permutes are cheaper than >> a generic permute. Have you looked at any examples in that space . I >> surely wouldn't like to see a sequence of interleave / transpose >> change into a generic permute operation on Neon as that would be far >> more expensive than this. It surely needs more testting than just >> this bit before going in. The reason being that this would likely take >> more registers and indeed produce loads of a constant pool for the new >> mask. What do you call constant / non-constant? The combined permutation is still constant, although the expansion (in the back-end) might fail to expand it efficiently and fall back to the generic permutation expansion... > The patch does not do that. It merely assumes that the target knows > how to perform an optimal constant permute and that two constant > permutes never generate better code than a single one. Which, to be honest, is false on all platforms I know, although I did contribute some minor enhancements for x86.
On Mon, 13 Aug 2012, Richard Guenther wrote: >>>> + /* Check that it is only used here. We cannot use has_single_use >>>> + since the expression is using it twice itself... */ >>> >>> Ah ... so then >>> >>> || num_imm_uses (op0) != 2 >> >> Ah, ok, that's simpler indeed, but there were such dire warnings to never >> use that evil function unless absolutely necessary that I didn't dare use >> it... Thanks for the permission. > > If your new predicate would match more places (can you do a quick search?) You mean: if there are more optimizations that either already check for double use in the same statement, or could benefit from doing so? I'll take a look. > then we want it in tree-flow-inline.h instead of in > tree-ssa-forwprop.c. But yes, > num_imm_uses can be expensive. For now just stick with the above. I assume "the above" means "num_imm_uses (op0) != 2", since both versions are above ;-) > The natural SSA (and forwprop) way is to look for BIT_FIELD_REF and see > if the def stmt is a VEC_PERM_EXPR. Thanks again,
On Mon, 13 Aug 2012, Jakub Jelinek wrote: > On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: >> The patch does not do that. It merely assumes that the target knows >> how to perform an optimal constant permute and that two constant >> permutes never generate better code than a single one. > > Still, the patch should do some tests whether it is beneficial. > At least a can_vec_perm_p (mode, false, sel) test of the resulting > permutation if both the original permutations pass that test, Sounds good. The last argument should be the constant permutation vector, presented as an array of indices stored in unsigned char? I hadn't realized we already had access to backend information that early in the compilation. It doesn't give the cost though. > and perhaps > additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and > passes for both the original permutations then it should also pass > for the resulting permutation. Isn't that implied by the previous test (I see calls to vec_perm_const_ok in there)? Or maybe not quite?
On Mon, Aug 13, 2012 at 03:45:00PM +0200, Marc Glisse wrote: > On Mon, 13 Aug 2012, Jakub Jelinek wrote: > > >On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: > >>The patch does not do that. It merely assumes that the target knows > >>how to perform an optimal constant permute and that two constant > >>permutes never generate better code than a single one. > > > >Still, the patch should do some tests whether it is beneficial. > >At least a can_vec_perm_p (mode, false, sel) test of the resulting > >permutation if both the original permutations pass that test, > > Sounds good. The last argument should be the constant permutation > vector, presented as an array of indices stored in unsigned char? I > hadn't realized we already had access to backend information that > early in the compilation. It doesn't give the cost though. Yeah, it doesn't give the cost, just whether it is supported at all. > >and perhaps > >additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and > >passes for both the original permutations then it should also pass > >for the resulting permutation. > > Isn't that implied by the previous test (I see calls to > vec_perm_const_ok in there)? Or maybe not quite? can_vec_perm_p can also return true if e.g. the target supports generic variable permutation for the mode in question. So the targetm.vectorize.vec_perm_const_ok check is stricter than that, it tells you whether it can be supported by some constant permutation (or a sequence of them, you know how the i386 code for that is complicated). Jakub
On 13 August 2012 14:21, Marc Glisse <marc.glisse@inria.fr> wrote: > On Mon, 13 Aug 2012, Richard Guenther wrote: > >> On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan >> <ramana.radhakrishnan@linaro.org> wrote: >>>> >>>> >>>> I guess people will complain soon enough if this causes horrible >>>> performance >>>> regressions in vectorized code. >>> >>> >>> Not having looked at your patch in great detail,. surely what we don't >>> want is a situation where 2 constant permutations are converted into >>> one generic permute. Based on a quick read of your patch I couldn't >>> work that out. It might be that 2 constant permutes are cheaper than >>> a generic permute. Have you looked at any examples in that space . I >>> surely wouldn't like to see a sequence of interleave / transpose >>> change into a generic permute operation on Neon as that would be far >>> more expensive than this. It surely needs more testting than just >>> this bit before going in. The reason being that this would likely take >>> more registers and indeed produce loads of a constant pool for the new >>> mask. > > > What do you call constant / non-constant? The combined permutation is still > constant, although the expansion (in the back-end) might fail to expand it > efficiently and fall back to the generic permutation expansion... That is exactly what I was worried about. By constant I would expect something that is expanded as a constant permute by the backend - an interleave operation or a transpose operation or indeed some of the funky operations as below in ARM / Neon which is the SIMD architecture I'm most familiar with . 
If you had something that did the following : uint8x8_t tst_vrev642_u8 (uint8x8_t __a) { uint8x8_t __rv; uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0}; uint8x8_t __mask2 = { 0, 8, 2, 10, 4, 12, 6, 14 }; return __builtin_shuffle ( __builtin_shuffle ( __a, __mask1), __mask2) ; } I would expect these instructions vrev64.8 d0, d0 vmov d16, d0 @ v8qi vtrn.8 d0, d16 bx lr With your patch I see tst_vrev642_u8: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. fldd d16, .L2 vtbl.8 d0, {d0}, d16 bx lr .L3: .align 3 .L2: .byte 7 .byte 7 .byte 5 .byte 5 .byte 3 .byte 3 .byte 1 .byte 1 It might be that the backend predicates need tightening in this case but why not try to get cost info about such combinations rather than just doing this gratuitously ? While in this case the ARM port might be wrong , but in general when the vector permute rewrites were done we chose to go ahead and keep the generic constant permutes in the backend as the last resort to fall back to. However if fwprop starts making such transformations really one ought to get relative costs for each of these operations rather than allowing gratuitous replacements. Thanks, Ramana
On 13 August 2012 14:54, Jakub Jelinek <jakub@redhat.com> wrote: > On Mon, Aug 13, 2012 at 03:45:00PM +0200, Marc Glisse wrote: >> On Mon, 13 Aug 2012, Jakub Jelinek wrote: >> >> >On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: >> >>The patch does not do that. It merely assumes that the target knows >> >>how to perform an optimal constant permute and that two constant >> >>permutes never generate better code than a single one. >> > >> >Still, the patch should do some tests whether it is beneficial. >> >At least a can_vec_perm_p (mode, false, sel) test of the resulting >> >permutation if both the original permutations pass that test, >> >> Sounds good. The last argument should be the constant permutation >> vector, presented as an array of indices stored in unsigned char? I >> hadn't realized we already had access to backend information that >> early in the compilation. It doesn't give the cost though. > > Yeah, it doesn't give the cost, just whether it is supported at all. Maybe we need an interface of that form. A generic permute operation used for constant permute operations is going to be more expensive than more specialized constant permute operations. It might be that this cost gets amortized over a large number of operations at which point it makes sense but the compiler should make this transformation based on cost rather than just whether something is supported or not. Ramana
On Mon, 13 Aug 2012, Ramana Radhakrishnan wrote: > On 13 August 2012 14:21, Marc Glisse <marc.glisse@inria.fr> wrote: >> On Mon, 13 Aug 2012, Richard Guenther wrote: >> >>> On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan >>> <ramana.radhakrishnan@linaro.org> wrote: >>>>> >>>>> >>>>> I guess people will complain soon enough if this causes horrible >>>>> performance >>>>> regressions in vectorized code. >>>> >>>> >>>> Not having looked at your patch in great detail,. surely what we don't >>>> want is a situation where 2 constant permutations are converted into >>>> one generic permute. Based on a quick read of your patch I couldn't >>>> work that out. It might be that 2 constant permutes are cheaper than >>>> a generic permute. Have you looked at any examples in that space . I >>>> surely wouldn't like to see a sequence of interleave / transpose >>>> change into a generic permute operation on Neon as that would be far >>>> more expensive than this. It surely needs more testting than just >>>> this bit before going in. The reason being that this would likely take >>>> more registers and indeed produce loads of a constant pool for the new >>>> mask. >> >> >> What do you call constant / non-constant? The combined permutation is still >> constant, although the expansion (in the back-end) might fail to expand it >> efficiently and fall back to the generic permutation expansion... > > > That is exactly what I was worried about. By constant I would expect > something that is expanded as a constant permute by the backend - an > interleave operation or a transpose operation or indeed some of the > funky operations as below in ARM / Neon which is the SIMD architecture > I'm most familiar with . 
> > > If you had something that did the following : > > > uint8x8_t > tst_vrev642_u8 (uint8x8_t __a) > { > uint8x8_t __rv; > uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0}; > uint8x8_t __mask2 = { 0, 8, 2, 10, 4, 12, 6, 14 }; > return __builtin_shuffle ( __builtin_shuffle ( __a, __mask1), __mask2) ; > } > > > > I would expect these instructions > > vrev64.8 d0, d0 > vmov d16, d0 @ v8qi > vtrn.8 d0, d16 > bx lr > > With your patch I see > > tst_vrev642_u8: > @ args = 0, pretend = 0, frame = 0 > @ frame_needed = 0, uses_anonymous_args = 0 > @ link register save eliminated. > fldd d16, .L2 > vtbl.8 d0, {d0}, d16 > bx lr > .L3: > .align 3 > .L2: > .byte 7 > .byte 7 > .byte 5 > .byte 5 > .byte 3 > .byte 3 > .byte 1 > .byte 1 Seems to be one instruction shorter at least ;-) Yes, there can be much worse regressions than that because of the patch (like 40 instructions instead of 4, in the x86 backend). > It might be that the backend predicates need tightening in this case > but why not try to get cost info about such combinations rather than > just doing this gratuitously ? I don't think the word gratuitous is appropriate. middle-end replaces a+-b with a-b without first asking the backend whether it might be more efficient. One permutation is better than 2. It just happens that the range of possible permutations is too large (and the basic instructions are too strange) for backends to do a good job on them, and thus keeping toplevel input as a hint is important. > While in this case the ARM port might > be wrong , but in general when the vector permute rewrites were done > we chose to go ahead and keep the generic constant permutes in the > backend as the last resort to fall back to. However if fwprop starts > making such transformations really one ought to get relative costs for > each of these operations rather than allowing gratuitous replacements. Indeed, having costs would help, but that's a lot of work. 
As you can see from the original email, I am willing to limit the optimization to the case where the combined permutation is the identity (yes, that's quite drastic...). I should probably implement that special case anyway, because it doesn't require its argument to have a single use for the optimization to make sense.
On Mon, Aug 13, 2012 at 4:12 PM, Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> wrote: > On 13 August 2012 14:54, Jakub Jelinek <jakub@redhat.com> wrote: >> On Mon, Aug 13, 2012 at 03:45:00PM +0200, Marc Glisse wrote: >>> On Mon, 13 Aug 2012, Jakub Jelinek wrote: >>> >>> >On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: >>> >>The patch does not do that. It merely assumes that the target knows >>> >>how to perform an optimal constant permute and that two constant >>> >>permutes never generate better code than a single one. >>> > >>> >Still, the patch should do some tests whether it is beneficial. >>> >At least a can_vec_perm_p (mode, false, sel) test of the resulting >>> >permutation if both the original permutations pass that test, >>> >>> Sounds good. The last argument should be the constant permutation >>> vector, presented as an array of indices stored in unsigned char? I >>> hadn't realized we already had access to backend information that >>> early in the compilation. It doesn't give the cost though. >> >> Yeah, it doesn't give the cost, just whether it is supported at all. > > > Maybe we need an interface of that form. A generic permute operation > used for constant permute operations is going to be more expensive > than more specialized constant permute operations. It might be that > this cost gets amortized over a large number of operations at which > point it makes sense but the compiler should make this transformation > based on cost rather than just whether something is supported or not. Well. I think the middle-end can reasonably assume that backends know how to most efficiently perform any constant permute. We do not limit converting a = a + 5; a = a + 254; to a = a + 259; either though a target may generate better code for the first case. Which means that we should go without a target test and take regressions as they appear as an opportunity to improve targets instead. Richard. > Ramana
On Mon, 13 Aug 2012, Marc Glisse wrote: > On Mon, 13 Aug 2012, Richard Guenther wrote: > >> If your new predicate would match more places (can you do a quick search?) > > You mean: if there are more optimizations that either already check for > double use in the same statement, or could benefit from doing so? I'll take a > look. I couldn't find anything obvious (not that I understood a significant percentage of what I saw...). Note that the comments are efficient since the only current use of num_imm_uses is in a debug printf call. I'll give it a few more days for the conversation to settle, so I know what I should do between: - the barely modified patch you accepted, - the check asked by Jakub, - the restriction to identity that prevents any regression (well...), - something else?
On Mon, 13 Aug 2012, Marc Glisse wrote: > I'll give it a few more days for the conversation to settle, so I know what I > should do between: > - the barely modified patch you accepted, > - the check asked by Jakub, > - the restriction to identity that prevents any regression (well...), > - something else? I didn't realize the conversation would go quiet immediately... Is someone willing to take the responsibility of telling me which approach is right? I can add Jakub's checks (though I am not sure how I would test them), but I would rather not do it if the conclusion is that they are either unnecessary (original patch is ok) or insufficient (don't avoid Ramana's regressions). I can do the very safe option 3 (only combine permutations when the result is the identity permutation), but I first want to make sure the more general thing is bad. (sorry, it's my first patch that gets conflicting answers...)
On Wed, Aug 15, 2012 at 1:22 PM, Marc Glisse <marc.glisse@inria.fr> wrote: > On Mon, 13 Aug 2012, Marc Glisse wrote: > >> I'll give it a few more days for the conversation to settle, so I know >> what I should do between: >> - the barely modified patch you accepted, >> - the check asked by Jakub, >> - the restriction to identity that prevents any regression (well...), >> - something else? > > > I didn't realize the conversation would go quiet immediatly... > > Is someone willing to take the responsibility of telling me which approach > is right? I can add Jakub's checks (though I am not sure how I would test > them), but I would rather not do it if the conclusion is that they are > either unnecessary (original patch is ok) or insufficient (don't avoid > Ramana's regressions). I can do the very safe option 3 (only combine > permutations when the result is the identity permutation), but I first want > to make sure the more general thing is bad. > > (sorry, it's my first patch that gets conflicting answers...) Well, we're waiting for someone to break the tie ... I'd go with the original patch, improving the backends where necessary. Richard. > -- > Marc Glisse
[It looks like I missed hitting the send button on this response] > > Seems to be one instruction shorter at least ;-) Yes, there can be much > worse regressions than that because of the patch (like 40 instructions > instead of 4, in the x86 backend). If this is replacing 4 instructions with 40 in x86 backend maybe someone will notice :) Not a win in this particular testcase because the compiler replaces 2 constant permutes ( that's about 4 cycles) with a load from the constant pool , a generic permute and in addition are polluting the icache with guff in the constant pool . If you go to 3 -4 permutes into a single one then it might be a win but not till that point. > with a-b without first asking the backend whether it might be more > efficient. One permutation is better than 2. > It just happens that the range > of possible permutations is too large (and the basic instructions are too > strange) for backends to do a good job on them, and thus keeping toplevel > input as a hint is important. Of course, the problem here is this change of semantics with the hook TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes with constants in the *absence* of being able to deal with them with the specialized permutes. fwprop will now leave us at a point where each target has to now grow more knowledge with respect to how best to expand a generic constant permute with a sequence of permutes rather than just using the generic permute. Generating a sequence of permutes from a single constant permute will be a harder problem than (say) dealing with immediate expansions so you are pushing more complexity into the target but in the short term target maintainers should definitely have a heads up that folks using vector permute intrinsics could actually have performance regressions on their targets. Thanks, Ramana
On Wed, Aug 15, 2012 at 1:56 PM, Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> wrote: > [It looks like I missed hitting the send button on this response] > >> >> Seems to be one instruction shorter at least ;-) Yes, there can be much >> worse regressions than that because of the patch (like 40 instructions >> instead of 4, in the x86 backend). > > If this is replacing 4 instructions with 40 in x86 backend maybe > someone will notice :) > > Not a win in this particular testcase because the compiler replaces 2 > constant permutes ( that's about 4 cycles) with a load from the > constant pool , a generic permute and in addition are polluting the > icache with guff in the constant pool . If you go to 3 -4 permutes > into a single one then it might be a win but not till that point. > > >> with a-b without first asking the backend whether it might be more >> efficient. One permutation is better than 2. >> It just happens that the range >> of possible permutations is too large (and the basic instructions are too >> strange) for backends to do a good job on them, and thus keeping toplevel >> input as a hint is important. > > Of-course, the problem here is this change of semantics with the hook > TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes > with constants in the *absence* of being able to deal with them with > the specialized permutes. fwprop will now leave us at a point where > each target has to now grow more knowledge with respect to how best to > expand a generic constant permute with a sequence of permutes rather > than just using the generic permute. > > Generating a sequence of permutes from a single constant permute will > be a harder problem than (say) dealing with immediate expansions so > you are pushing more complexity into the target but in the short term > target maintainers should definitely have a heads up that folks using > vector permute intrinsics could actually have performance regressions > on their targets. 
It's of course the same with the user input containing such a non-optimal handled constant permute. So I'm less convinced that it's too much burden on the target side. OTOH if there is a generic kind of shuffles that targets do not implement directly but can be simulated by two that are directly implemented pushing the logic to the expander (and adjusting the target hook semantic) would be ok. Richard. > Thanks, > Ramana
On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote: > Well, we're waiting for someone to break the tie ... I'd go with the original > patch, improving the backends where necessary. E.g. i?86/x86_64 with just plain -msse2 has only very small subset of constant shuffles (and no variable shuffle), so by doing the transformation you could end up with a scalarized shuffle instead of two constant vector shuffles. Expecting the backend to figure it out and doing the two constant vector shuffles in every case is not realistic, i386/x86_64 has way too many different shuffling instructions (and worse different ISA levels have different subsets of them) and while a lot of effort has been spent on it already by Richard, me, Marc and others, we are nowhere close to having optimal sequence in many cases for various modes and ISA levels. Doing a brute force might be too expensive. Jakub
On Wed, Aug 15, 2012 at 2:07 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote: >> Well, we're waiting for someone to break the tie ... I'd go with the original >> patch, improving the backends where necessary. > > E.g. i?86/x86_64 with just plain -msse2 has only very small subset of > constant shuffles (and no variable shuffle), so by doing the transformation > you could end up with a scalarized shuffle instead of two constant vector > shuffles. Expecting the backend to figure it out and doing the two constant > vector shuffles in every case is not realistic, i386/x86_64 has way too many > different shuffling instructions (and worse different ISA levels have > different subsets of them) and while a lot of effort has been spent on > it already by Richard, me, Marc and others, we are nowhere close to having > optimal sequence in many cases for various modes and ISA levels. Doing a > brute force might be too expensive. Ok. That would still leave us with the issue Ramana brought up - the target hook returning true unconditionally if a generic permute is implemented. We just avoid generic expansion by tree-vect-generic.c that way. (I wonder if there exists some automated machinery that finds a path using existing instructions to shuffle from {0, 1, ..., n } to the mask and if that would be a viable way to handle shuffle expansions, or pre-compute them all) Richard. > Jakub
On 15 August 2012 13:07, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote: >> Well, we're waiting for someone to break the tie ... I'd go with the original >> patch, improving the backends where necessary. > > E.g. i?86/x86_64 with just plain -msse2 has only very small subset of > constant shuffles (and no variable shuffle), so by doing the transformation > you could end up with a scalarized shuffle instead of two constant vector > shuffles. Expecting the backend to figure it out and doing the two constant > vector shuffles in every case is not realistic, i386/x86_64 has way too many > different shuffling instructions (and worse different ISA levels have > different subsets of them) and while a lot of effort has been spent on > it already by Richard, me, Marc and others, we are nowhere close to having > optimal sequence in many cases for various modes and ISA levels. Doing a > brute force might be too expensive. > Neon has at least 7-8 such forms. Brute force computing this would be expensive. I don't know what happens on other vector targets - surely other SIMD targets in GCC will also have the same problem. Ramana > Jakub
On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote: > Ok. That would still leave us with the issue Ramana brought up - the > target hook returning true unconditionally if a generic permute is implemented. > We just avoid generic expansion by tree-vect-generic.c that way. Yeah, if there is a generic permute, can_vec_perm_p will return true always for the modes for which it is available. Which is why I've been suggesting also the target hook which should return false if only generic permute is going to be used. Jakub
On Wed, Aug 15, 2012 at 2:29 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote: >> Ok. That would still leave us with the issue Ramana brought up - the >> target hook returning true unconditionally if a generic permute is implemented. >> We just avoid generic expansion by tree-vect-generic.c that way. > > Yeah, if there is a generic permute, can_vec_perm_p will return true always > for the modes for which it is available. Which is why I've been suggesting > also the target hook which should return false if only generic permute is > going to be used. Well. What about returning a cost instead? We don't want to transform two insns to four either. Richard. > Jakub
On Wed, Aug 15, 2012 at 02:36:54PM +0200, Richard Guenther wrote: > On Wed, Aug 15, 2012 at 2:29 PM, Jakub Jelinek <jakub@redhat.com> wrote: > > On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote: > >> Ok. That would still leave us with the issue Ramana brought up - the > >> target hook returning true unconditionally if a generic permute is implemented. > >> We just avoid generic expansion by tree-vect-generic.c that way. > > > > Yeah, if there is a generic permute, can_vec_perm_p will return true always > > for the modes for which it is available. Which is why I've been suggesting > > also the target hook which should return false if only generic permute is > > going to be used. > > Well. What about returning a cost instead? We don't want to transform > two insns to four either. That could work, it would just be a lot of work to change it (and more costly). E.g. on i386 for 16-byte vectors with certain ISAs we return true right away, when returning cost we'd need to go the slow path always even when the caller is just interested in a boolean flag whether it is possible or not. CCing rth as the author of the hooks. Jakub
On Wed, Aug 15, 2012 at 2:53 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Aug 15, 2012 at 02:36:54PM +0200, Richard Guenther wrote: >> On Wed, Aug 15, 2012 at 2:29 PM, Jakub Jelinek <jakub@redhat.com> wrote: >> > On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote: >> >> Ok. That would still leave us with the issue Ramana brought up - the >> >> target hook returning true unconditionally if a generic permute is implemented. >> >> We just avoid generic expansion by tree-vect-generic.c that way. >> > >> > Yeah, if there is a generic permute, can_vec_perm_p will return true always >> > for the modes for which it is available. Which is why I've been suggesting >> > also the target hook which should return false if only generic permute is >> > going to be used. >> >> Well. What about returning a cost instead? We don't want to transform >> two insns to four either. > > That could work, it would just be a lot of work to change it (and more > costly). E.g. on i386 for 16-byte vectors with certain ISAs we return true > right away, when returning cost we'd need to go the slow path always > even when the caller is just interested in a boolean flag whether it is > possible or not. > > CCing rth as the author of the hooks. We'd want more accurate costs for the vectorizer cost model anyways. Richard. > Jakub
On Wed, 15 Aug 2012, Richard Guenther wrote: > On Wed, Aug 15, 2012 at 1:56 PM, Ramana Radhakrishnan > <ramana.radhakrishnan@linaro.org> wrote: > > Of-course, the problem here is this change of semantics with the hook > > TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes > > with constants in the *absence* of being able to deal with them with > > the specialized permutes. fwprop will now leave us at a point where > > each target has to now grow more knowledge with respect to how best to > > expand a generic constant permute with a sequence of permutes rather > > than just using the generic permute. > > > > Generating a sequence of permutes from a single constant permute will > > be a harder problem than (say) dealing with immediate expansions so > > you are pushing more complexity into the target but in the short term > > target maintainers should definitely have a heads up that folks using > > vector permute intrinsics could actually have performance regressions > > on their targets. > > It's of course the same with the user input containing such a non-optimal > handled constant permute. So I'm less convinced that it's too much burden > on the target side. OTOH if there is a generic kind of shuffles that targets > do not implement directly but can be simulated by two that are directly > implemented pushing the logic to the expander (and adjusting the target > hook semantic) would be ok. There's a wonderful knapsack-class problem in there, something for next year's GSoC? (Given a constant permutation, synthesize it with a set of simpler operations with total cost < constant_shuffle, where the "simpler operation" and "costs" are target-specific (query for presence by TARGET_VEC_PERM_CONST_OK and cost hooks; decompose to "simpler operation" by generic heuristics). 
A similar but simpler problem is to synthesize a constant vector instead of loading it from memory (though a load may be cheap enough for most SIMD targets that this may be uninteresting to generalize). brgds, H-P
Index: gcc/tree-ssa-forwprop.c =================================================================== --- gcc/tree-ssa-forwprop.c (revision 190311) +++ gcc/tree-ssa-forwprop.c (working copy) @@ -2569,20 +2569,97 @@ combine_conversions (gimple_stmt_iterato gimple_assign_set_rhs_code (stmt, CONVERT_EXPR); update_stmt (stmt); return remove_prop_source_from_use (op0) ? 2 : 1; } } } return 0; } +/* Return true if VAR has no nondebug use but in stmt. */ +static bool +is_only_used_in (const_tree var, const_gimple stmt) +{ + const ssa_use_operand_t *const ptr0 = &(SSA_NAME_IMM_USE_NODE (var)); + const ssa_use_operand_t *ptr = ptr0->next; + + for (; ptr != ptr0; ptr = ptr->next) + { + const_gimple use_stmt = USE_STMT (ptr); + if (is_gimple_debug (use_stmt)) + continue; + if (use_stmt != stmt) + return false; + } + return true; +} + +/* Combine two shuffles in a row. Returns 1 if there were any changes + made, 2 if cfg-cleanup needs to run. Else it returns 0. */ + +static int +simplify_permutation (gimple_stmt_iterator *gsi) +{ + gimple stmt = gsi_stmt (*gsi); + gimple def_stmt; + tree op0, op1, op2, op3, mask; + enum tree_code code = gimple_assign_rhs_code (stmt); + enum tree_code code2; + location_t loc = gimple_location (stmt); + + gcc_checking_assert (code == VEC_PERM_EXPR); + + op0 = gimple_assign_rhs1 (stmt); + op1 = gimple_assign_rhs2 (stmt); + op2 = gimple_assign_rhs3 (stmt); + + if (TREE_CODE (op0) != SSA_NAME) + return 0; + + if (TREE_CODE (op2) != VECTOR_CST) + return 0; + + if (op0 != op1) + return 0; + + def_stmt = SSA_NAME_DEF_STMT (op0); + if (!def_stmt || !is_gimple_assign (def_stmt) + || !can_propagate_from (def_stmt) + || !is_only_used_in (op0, stmt)) + return 0; + + /* Check that it is only used here. We cannot use has_single_use + since the expression is using it twice itself... */ + + code2 = gimple_assign_rhs_code (def_stmt); + + /* Two consecutive shuffles. 
*/ + if (code2 == VEC_PERM_EXPR) + { + op3 = gimple_assign_rhs3 (def_stmt); + if (TREE_CODE (op3) != VECTOR_CST) + return 0; + mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3), + op3, op3, op2); + gcc_assert (TREE_CODE (mask) == VECTOR_CST); + gimple_assign_set_rhs1 (stmt, gimple_assign_rhs1 (def_stmt)); + gimple_assign_set_rhs2 (stmt, gimple_assign_rhs2 (def_stmt)); + gimple_assign_set_rhs3 (stmt, mask); + fold_stmt_inplace (gsi); + update_stmt (stmt); + return remove_prop_source_from_use (op0) ? 2 : 1; + } + + return false; +} + /* Main entry point for the forward propagation and statement combine optimizer. */ static unsigned int ssa_forward_propagate_and_combine (void) { basic_block bb; unsigned int todoflags = 0; cfg_changed = false; @@ -2731,20 +2808,27 @@ ssa_forward_propagate_and_combine (void) changed = associate_pointerplus (&gsi); else if (CONVERT_EXPR_CODE_P (code) || code == FLOAT_EXPR || code == FIX_TRUNC_EXPR) { int did_something = combine_conversions (&gsi); if (did_something == 2) cfg_changed = true; changed = did_something != 0; } + else if (code == VEC_PERM_EXPR) + { + int did_something = simplify_permutation (&gsi); + if (did_something == 2) + cfg_changed = true; + changed = did_something != 0; + } break; } case GIMPLE_SWITCH: changed = simplify_gimple_switch (stmt); break; case GIMPLE_COND: { int did_something; Index: gcc/fold-const.c =================================================================== --- gcc/fold-const.c (revision 190311) +++ gcc/fold-const.c (working copy) @@ -14148,58 +14148,96 @@ fold_ternary_loc (location_t loc, enum t return fold_build2_loc (loc, PLUS_EXPR, type, const_binop (MULT_EXPR, arg0, arg1), arg2); if (integer_zerop (arg2)) return fold_build2_loc (loc, MULT_EXPR, type, arg0, arg1); return fold_fma (loc, type, arg0, arg1, arg2); case VEC_PERM_EXPR: if (TREE_CODE (arg2) == VECTOR_CST) { - unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i; + unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i, mask; 
unsigned char *sel = XALLOCAVEC (unsigned char, nelts); tree t; bool need_mask_canon = false; + bool all_in_vec0 = true; + bool all_in_vec1 = true; + bool maybe_identity = true; + bool single_arg = (op0 == op1); + bool changed = false; + mask = single_arg ? (nelts - 1) : (2 * nelts - 1); gcc_assert (nelts == VECTOR_CST_NELTS (arg2)); for (i = 0; i < nelts; i++) { tree val = VECTOR_CST_ELT (arg2, i); if (TREE_CODE (val) != INTEGER_CST) return NULL_TREE; - sel[i] = TREE_INT_CST_LOW (val) & (2 * nelts - 1); + sel[i] = TREE_INT_CST_LOW (val) & mask; if (TREE_INT_CST_HIGH (val) || ((unsigned HOST_WIDE_INT) TREE_INT_CST_LOW (val) != sel[i])) need_mask_canon = true; + + if (sel[i] < nelts) + all_in_vec1 = false; + else + all_in_vec0 = false; + + if ((sel[i] & (nelts-1)) != i) + maybe_identity = false; + } + + if (maybe_identity) + { + if (all_in_vec0) + return op0; + if (all_in_vec1) + return op1; } if ((TREE_CODE (arg0) == VECTOR_CST || TREE_CODE (arg0) == CONSTRUCTOR) && (TREE_CODE (arg1) == VECTOR_CST || TREE_CODE (arg1) == CONSTRUCTOR)) { t = fold_vec_perm (type, arg0, arg1, sel); if (t != NULL_TREE) return t; } + if (all_in_vec0) + op1 = op0; + else if (all_in_vec1) + { + op0 = op1; + for (i = 0; i < nelts; i++) + sel[i] -= nelts; + need_mask_canon = true; + } + + if (op0 == op1 && !single_arg) + changed = true; + if (need_mask_canon && arg2 == op2) { tree *tsel = XALLOCAVEC (tree, nelts); tree eltype = TREE_TYPE (TREE_TYPE (arg2)); for (i = 0; i < nelts; i++) tsel[i] = build_int_cst (eltype, sel[i]); - t = build_vector (TREE_TYPE (arg2), tsel); - return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, t); + op2 = build_vector (TREE_TYPE (arg2), tsel); + changed = true; } + + if (changed) + return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, op2); } return NULL_TREE; default: return NULL_TREE; } /* switch (code) */ } /* Perform constant folding and related simplification of EXPR. 
The related simplifications include x*1 => x, x*0 => 0, etc., Index: gcc/testsuite/gcc.dg/tree-ssa/forwprop-19.c =================================================================== --- gcc/testsuite/gcc.dg/tree-ssa/forwprop-19.c (revision 0) +++ gcc/testsuite/gcc.dg/tree-ssa/forwprop-19.c (revision 0) @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-forwprop2" } */ + +typedef int vec __attribute__((vector_size (2 * sizeof (int)))); +void f (vec *x1, vec *x2) +{ + vec m = { 3, 1 }; + vec n = { 1, 8 }; + vec y = __builtin_shuffle (*x1, *x2, n); + vec z = __builtin_shuffle (y, m); + *x1 = z; +} + +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 0, 0 }" "forwprop2" } } */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "forwprop2" } } */ +/* { dg-final { scan-tree-dump-times "x2" 1 "forwprop2" } } */ +/* { dg-final { cleanup-tree-dump "forwprop2" } } */