Message ID | alpine.LSU.2.20.1801251253310.18265@zhemvz.fhfr.qr |
---|---|
State | New |
Headers | show |
Series | Fix PR81082 | expand |
On Thu, 25 Jan 2018, Richard Biener wrote: > --- gcc/match.pd (revision 257047) > +++ gcc/match.pd (working copy) > @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (minus (convert (view_convert:stype @1)) > (convert (view_convert:stype @2))))))) > > +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). > + Modeled after fold_plusminus_mult_expr. */ > +(if (!TYPE_SATURATING (type) > + && (!FLOAT_TYPE_P (type) || flag_associative_math)) > + (for plusminus (plus minus) > + (simplify > + (plusminus (mult:s @0 @1) (mult:cs @0 @2)) No :c on the first mult, so we don't actually handle A*C+B*C?
On Thu, 25 Jan 2018, Marc Glisse wrote: > On Thu, 25 Jan 2018, Richard Biener wrote: > > > --- gcc/match.pd (revision 257047) > > +++ gcc/match.pd (working copy) > > @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > (minus (convert (view_convert:stype @1)) > > (convert (view_convert:stype @2))))))) > > > > +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). > > + Modeled after fold_plusminus_mult_expr. */ > > +(if (!TYPE_SATURATING (type) > > + && (!FLOAT_TYPE_P (type) || flag_associative_math)) > > + (for plusminus (plus minus) > > + (simplify > > + (plusminus (mult:s @0 @1) (mult:cs @0 @2)) > > No :c on the first mult, so we don't actually handle A*C+B*C? Hmm, I somehow convinced myself that it's only necessary on one of the mults... but you are of course right. Will re-test with that fixed. Richard.
On Thu, 25 Jan 2018, Richard Biener wrote: > On Thu, 25 Jan 2018, Marc Glisse wrote: > > > On Thu, 25 Jan 2018, Richard Biener wrote: > > > > > --- gcc/match.pd (revision 257047) > > > +++ gcc/match.pd (working copy) > > > @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > (minus (convert (view_convert:stype @1)) > > > (convert (view_convert:stype @2))))))) > > > > > > +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). > > > + Modeled after fold_plusminus_mult_expr. */ > > > +(if (!TYPE_SATURATING (type) > > > + && (!FLOAT_TYPE_P (type) || flag_associative_math)) > > > + (for plusminus (plus minus) > > > + (simplify > > > + (plusminus (mult:s @0 @1) (mult:cs @0 @2)) > > > > No :c on the first mult, so we don't actually handle A*C+B*C? > > Hmm, I somehow convinced myself that it's only necessary on one > of the mults... but you are of course right. Will re-test with > that fixed. This is what I have applied. Note I had to XFAIL a minor part of gcc.dg/tree-ssa/loop-15.c, namely simplifying the final value replacement down to n * n. While the patch should enable this the place where the transform could trigger with enough information about n is VRP but that doesn't end up folding all stmts - and that's something I'm not willing to change right now. Bootstrapped and tested on x86_64-unknown-linux-gnu, applied on trunk. Richard. 2018-01-26 Richard Biener <rguenther@suse.de> PR tree-optimization/81082 * fold-const.c (fold_plusminus_mult_expr): Do not perform the association if it requires casting to unsigned. * match.pd ((A * C) +- (B * C) -> (A+-B)): New patterns derived from fold_plusminus_mult_expr to catch important cases late when range info is available. * gcc.dg/vect/pr81082.c: New testcase. * gcc.dg/tree-ssa/loop-15.c: XFAIL the (int)((unsigned)n + -1U) * n + n simplification to n * n. Index: gcc/fold-const.c =================================================================== --- gcc/fold-const.c (revision 257047) +++ gcc/fold-const.c (working copy) @@ -7097,7 +7097,7 @@ fold_plusminus_mult_expr (location_t loc /* Same may be zero and thus the operation 'code' may overflow. Likewise same may be minus one and thus the multiplication may overflow. Perform - the operations in an unsigned type. */ + the sum operation in an unsigned type. */ tree utype = unsigned_type_for (type); tree tem = fold_build2_loc (loc, code, utype, fold_convert_loc (loc, utype, alt0), @@ -7110,9 +7110,9 @@ fold_plusminus_mult_expr (location_t loc return fold_build2_loc (loc, MULT_EXPR, type, fold_convert (type, tem), same); - return fold_convert_loc (loc, type, - fold_build2_loc (loc, MULT_EXPR, utype, tem, - fold_convert_loc (loc, utype, same))); + /* Do not resort to unsigned multiplication because + we lose the no-overflow property of the expression. */ + return NULL_TREE; } /* Subroutine of native_encode_expr. Encode the INTEGER_CST Index: gcc/match.pd =================================================================== --- gcc/match.pd (revision 257047) +++ gcc/match.pd (working copy) @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (minus (convert (view_convert:stype @1)) (convert (view_convert:stype @2))))))) +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). + Modeled after fold_plusminus_mult_expr. */ +(if (!TYPE_SATURATING (type) + && (!FLOAT_TYPE_P (type) || flag_associative_math)) + (for plusminus (plus minus) + (simplify + (plusminus (mult:cs @0 @1) (mult:cs @0 @2)) + (if (!ANY_INTEGRAL_TYPE_P (type) + || TYPE_OVERFLOW_WRAPS (type) + || (INTEGRAL_TYPE_P (type) + && tree_expr_nonzero_p (@0) + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) + (mult (plusminus @1 @2) @0))) + /* We cannot generate constant 1 for fract. */ + (if (!ALL_FRACT_MODE_P (TYPE_MODE (type))) + (simplify + (plusminus @0 (mult:cs @0 @2)) + (if (!ANY_INTEGRAL_TYPE_P (type) + || TYPE_OVERFLOW_WRAPS (type) + || (INTEGRAL_TYPE_P (type) + && tree_expr_nonzero_p (@0) + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) + (mult (plusminus { build_one_cst (type); } @2) @0))) + (simplify + (plusminus (mult:cs @0 @2) @0) + (if (!ANY_INTEGRAL_TYPE_P (type) + || TYPE_OVERFLOW_WRAPS (type) + || (INTEGRAL_TYPE_P (type) + && tree_expr_nonzero_p (@0) + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) + (mult (plusminus @2 { build_one_cst (type); }) @0)))))) /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */ Index: gcc/testsuite/gcc.dg/vect/pr81082.c =================================================================== --- gcc/testsuite/gcc.dg/vect/pr81082.c (revision 0) +++ gcc/testsuite/gcc.dg/vect/pr81082.c (working copy) @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ + +int +f (int *x, int b1, int b2, int b3) +{ + int foo = 0; + for (int i1 = 0; i1 < b1; ++i1) + for (int i2 = 0; i2 < b2; ++i2) + for (int i3 = 0; i3 < b3; ++i3) + foo += x[i1 * b2 * b3 + i2 * b3 + (i3 - 1)]; + return foo; +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } */ Index: gcc/testsuite/gcc.dg/tree-ssa/loop-15.c =================================================================== --- gcc/testsuite/gcc.dg/tree-ssa/loop-15.c (revision 257048) +++ gcc/testsuite/gcc.dg/tree-ssa/loop-15.c (working copy) @@ -19,7 +19,7 @@ int bla(void) } /* Since the loop is removed, there should be no addition. */ -/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" { xfail *-*-* } } } */ /* { dg-final { scan-tree-dump-times " \\* " 1 "optimized" } } */ /* The if from the loop header copying remains in the code. */
On 26 January 2018 at 11:25, Richard Biener <rguenther@suse.de> wrote: > On Thu, 25 Jan 2018, Richard Biener wrote: > >> On Thu, 25 Jan 2018, Marc Glisse wrote: >> >> > On Thu, 25 Jan 2018, Richard Biener wrote: >> > >> > > --- gcc/match.pd (revision 257047) >> > > +++ gcc/match.pd (working copy) >> > > @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) >> > > (minus (convert (view_convert:stype @1)) >> > > (convert (view_convert:stype @2))))))) >> > > >> > > +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). >> > > + Modeled after fold_plusminus_mult_expr. */ >> > > +(if (!TYPE_SATURATING (type) >> > > + && (!FLOAT_TYPE_P (type) || flag_associative_math)) >> > > + (for plusminus (plus minus) >> > > + (simplify >> > > + (plusminus (mult:s @0 @1) (mult:cs @0 @2)) >> > >> > No :c on the first mult, so we don't actually handle A*C+B*C? >> >> Hmm, I somehow convinced myself that it's only necessary on one >> of the mults... but you are of course right. Will re-test with >> that fixed. > > This is what I have applied. Note I had to XFAIL a minor part of > gcc.dg/tree-ssa/loop-15.c, namely simplifying the final value > replacement down to n * n. While the patch should enable this > the place where the transform could trigger with enough information > about n is VRP but that doesn't end up folding all stmts - and > that's something I'm not willing to change right now. > > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied on trunk. > Hi, I think this patch caused regressions on aarch64: FAIL: gcc.dg/wmul-1.c scan-tree-dump-not widening_mul "WIDEN_MULT_PLUS_EXPR" Thanks, Christophe > Richard. > > 2018-01-26 Richard Biener <rguenther@suse.de> > > PR tree-optimization/81082 > * fold-const.c (fold_plusminus_mult_expr): Do not perform the > association if it requires casting to unsigned. > * match.pd ((A * C) +- (B * C) -> (A+-B)): New patterns derived > from fold_plusminus_mult_expr to catch important cases late when > range info is available. > > * gcc.dg/vect/pr81082.c: New testcase. > * gcc.dg/tree-ssa/loop-15.c: XFAIL the (int)((unsigned)n + -1U) * n + n > simplification to n * n. > > Index: gcc/fold-const.c > =================================================================== > --- gcc/fold-const.c (revision 257047) > +++ gcc/fold-const.c (working copy) > @@ -7097,7 +7097,7 @@ fold_plusminus_mult_expr (location_t loc > > /* Same may be zero and thus the operation 'code' may overflow. Likewise > same may be minus one and thus the multiplication may overflow. Perform > - the operations in an unsigned type. */ > + the sum operation in an unsigned type. */ > tree utype = unsigned_type_for (type); > tree tem = fold_build2_loc (loc, code, utype, > fold_convert_loc (loc, utype, alt0), > @@ -7110,9 +7110,9 @@ fold_plusminus_mult_expr (location_t loc > return fold_build2_loc (loc, MULT_EXPR, type, > fold_convert (type, tem), same); > > - return fold_convert_loc (loc, type, > - fold_build2_loc (loc, MULT_EXPR, utype, tem, > - fold_convert_loc (loc, utype, same))); > + /* Do not resort to unsigned multiplication because > + we lose the no-overflow property of the expression. */ > + return NULL_TREE; > } > > /* Subroutine of native_encode_expr. Encode the INTEGER_CST > Index: gcc/match.pd > =================================================================== > --- gcc/match.pd (revision 257047) > +++ gcc/match.pd (working copy) > @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (minus (convert (view_convert:stype @1)) > (convert (view_convert:stype @2))))))) > > +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). > + Modeled after fold_plusminus_mult_expr. */ > +(if (!TYPE_SATURATING (type) > + && (!FLOAT_TYPE_P (type) || flag_associative_math)) > + (for plusminus (plus minus) > + (simplify > + (plusminus (mult:cs @0 @1) (mult:cs @0 @2)) > + (if (!ANY_INTEGRAL_TYPE_P (type) > + || TYPE_OVERFLOW_WRAPS (type) > + || (INTEGRAL_TYPE_P (type) > + && tree_expr_nonzero_p (@0) > + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) > + (mult (plusminus @1 @2) @0))) > + /* We cannot generate constant 1 for fract. */ > + (if (!ALL_FRACT_MODE_P (TYPE_MODE (type))) > + (simplify > + (plusminus @0 (mult:cs @0 @2)) > + (if (!ANY_INTEGRAL_TYPE_P (type) > + || TYPE_OVERFLOW_WRAPS (type) > + || (INTEGRAL_TYPE_P (type) > + && tree_expr_nonzero_p (@0) > + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) > + (mult (plusminus { build_one_cst (type); } @2) @0))) > + (simplify > + (plusminus (mult:cs @0 @2) @0) > + (if (!ANY_INTEGRAL_TYPE_P (type) > + || TYPE_OVERFLOW_WRAPS (type) > + || (INTEGRAL_TYPE_P (type) > + && tree_expr_nonzero_p (@0) > + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) > + (mult (plusminus @2 { build_one_cst (type); }) @0)))))) > > /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */ > > Index: gcc/testsuite/gcc.dg/vect/pr81082.c > =================================================================== > --- gcc/testsuite/gcc.dg/vect/pr81082.c (revision 0) > +++ gcc/testsuite/gcc.dg/vect/pr81082.c (working copy) > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > + > +int > +f (int *x, int b1, int b2, int b3) > +{ > + int foo = 0; > + for (int i1 = 0; i1 < b1; ++i1) > + for (int i2 = 0; i2 < b2; ++i2) > + for (int i3 = 0; i3 < b3; ++i3) > + foo += x[i1 * b2 * b3 + i2 * b3 + (i3 - 1)]; > + return foo; > +} > + > +/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } */ > Index: gcc/testsuite/gcc.dg/tree-ssa/loop-15.c > =================================================================== > --- gcc/testsuite/gcc.dg/tree-ssa/loop-15.c (revision 257048) > +++ gcc/testsuite/gcc.dg/tree-ssa/loop-15.c (working copy) > @@ -19,7 +19,7 @@ int bla(void) > } > > /* Since the loop is removed, there should be no addition. */ > -/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" { xfail *-*-* } } } */ > /* { dg-final { scan-tree-dump-times " \\* " 1 "optimized" } } */ > > /* The if from the loop header copying remains in the code. */
On Fri, Jan 26, 2018 at 3:57 PM, Christophe Lyon <christophe.lyon@linaro.org> wrote: > On 26 January 2018 at 11:25, Richard Biener <rguenther@suse.de> wrote: >> On Thu, 25 Jan 2018, Richard Biener wrote: >> >>> On Thu, 25 Jan 2018, Marc Glisse wrote: >>> >>> > On Thu, 25 Jan 2018, Richard Biener wrote: >>> > >>> > > --- gcc/match.pd (revision 257047) >>> > > +++ gcc/match.pd (working copy) >>> > > @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) >>> > > (minus (convert (view_convert:stype @1)) >>> > > (convert (view_convert:stype @2))))))) >>> > > >>> > > +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). >>> > > + Modeled after fold_plusminus_mult_expr. */ >>> > > +(if (!TYPE_SATURATING (type) >>> > > + && (!FLOAT_TYPE_P (type) || flag_associative_math)) >>> > > + (for plusminus (plus minus) >>> > > + (simplify >>> > > + (plusminus (mult:s @0 @1) (mult:cs @0 @2)) >>> > >>> > No :c on the first mult, so we don't actually handle A*C+B*C? >>> >>> Hmm, I somehow convinced myself that it's only necessary on one >>> of the mults... but you are of course right. Will re-test with >>> that fixed. >> >> This is what I have applied. Note I had to XFAIL a minor part of >> gcc.dg/tree-ssa/loop-15.c, namely simplifying the final value >> replacement down to n * n. While the patch should enable this >> the place where the transform could trigger with enough information >> about n is VRP but that doesn't end up folding all stmts - and >> that's something I'm not willing to change right now. >> >> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied on trunk. >> > > Hi, > > > I think this patch caused regressions on aarch64: > FAIL: gcc.dg/wmul-1.c scan-tree-dump-not widening_mul "WIDEN_MULT_PLUS_EXPR" Late forwprop does @@ -75,7 +21,7 @@ _1 = (long unsigned int) Idx_6(D); _2 = _1 * 40; _12 = _1 * 4; - _17 = _2 + _12; + _17 = _1 * 44; _13 = Arr_7(D) + _17; MEM[(int[10] *)_13] = 1; _4 = _2 + 400; _18 = _4 + _12; _16 = Arr_7(D) + _18; MEM[(int[10] *)_16] = 2; which I'm not sure ends up profitable at the end. It makes _12 dead so the total number of multiplications stays the same. For this we then apply the WIDEN_MULT_PLUS_EXPR two times given _2 is no longer used multiple times. The reason the above transform happens is the match.pd :s behavior which "ignores" :s in case the resulting expression is "simple". Can you open a bugreport? Richard. > Thanks, > > Christophe > >> Richard. >> >> 2018-01-26 Richard Biener <rguenther@suse.de> >> >> PR tree-optimization/81082 >> * fold-const.c (fold_plusminus_mult_expr): Do not perform the >> association if it requires casting to unsigned. >> * match.pd ((A * C) +- (B * C) -> (A+-B)): New patterns derived >> from fold_plusminus_mult_expr to catch important cases late when >> range info is available. >> >> * gcc.dg/vect/pr81082.c: New testcase. >> * gcc.dg/tree-ssa/loop-15.c: XFAIL the (int)((unsigned)n + -1U) * n + n >> simplification to n * n. >> >> Index: gcc/fold-const.c >> =================================================================== >> --- gcc/fold-const.c (revision 257047) >> +++ gcc/fold-const.c (working copy) >> @@ -7097,7 +7097,7 @@ fold_plusminus_mult_expr (location_t loc >> >> /* Same may be zero and thus the operation 'code' may overflow. Likewise >> same may be minus one and thus the multiplication may overflow. Perform >> - the operations in an unsigned type. */ >> + the sum operation in an unsigned type. */ >> tree utype = unsigned_type_for (type); >> tree tem = fold_build2_loc (loc, code, utype, >> fold_convert_loc (loc, utype, alt0), >> @@ -7110,9 +7110,9 @@ fold_plusminus_mult_expr (location_t loc >> return fold_build2_loc (loc, MULT_EXPR, type, >> fold_convert (type, tem), same); >> >> - return fold_convert_loc (loc, type, >> - fold_build2_loc (loc, MULT_EXPR, utype, tem, >> - fold_convert_loc (loc, utype, same))); >> + /* Do not resort to unsigned multiplication because >> + we lose the no-overflow property of the expression. */ >> + return NULL_TREE; >> } >> >> /* Subroutine of native_encode_expr. Encode the INTEGER_CST >> Index: gcc/match.pd >> =================================================================== >> --- gcc/match.pd (revision 257047) >> +++ gcc/match.pd (working copy) >> @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) >> (minus (convert (view_convert:stype @1)) >> (convert (view_convert:stype @2))))))) >> >> +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). >> + Modeled after fold_plusminus_mult_expr. */ >> +(if (!TYPE_SATURATING (type) >> + && (!FLOAT_TYPE_P (type) || flag_associative_math)) >> + (for plusminus (plus minus) >> + (simplify >> + (plusminus (mult:cs @0 @1) (mult:cs @0 @2)) >> + (if (!ANY_INTEGRAL_TYPE_P (type) >> + || TYPE_OVERFLOW_WRAPS (type) >> + || (INTEGRAL_TYPE_P (type) >> + && tree_expr_nonzero_p (@0) >> + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) >> + (mult (plusminus @1 @2) @0))) >> + /* We cannot generate constant 1 for fract. */ >> + (if (!ALL_FRACT_MODE_P (TYPE_MODE (type))) >> + (simplify >> + (plusminus @0 (mult:cs @0 @2)) >> + (if (!ANY_INTEGRAL_TYPE_P (type) >> + || TYPE_OVERFLOW_WRAPS (type) >> + || (INTEGRAL_TYPE_P (type) >> + && tree_expr_nonzero_p (@0) >> + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) >> + (mult (plusminus { build_one_cst (type); } @2) @0))) >> + (simplify >> + (plusminus (mult:cs @0 @2) @0) >> + (if (!ANY_INTEGRAL_TYPE_P (type) >> + || TYPE_OVERFLOW_WRAPS (type) >> + || (INTEGRAL_TYPE_P (type) >> + && tree_expr_nonzero_p (@0) >> + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) >> + (mult (plusminus @2 { build_one_cst (type); }) @0)))))) >> >> /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */ >> >> Index: gcc/testsuite/gcc.dg/vect/pr81082.c >> =================================================================== >> --- gcc/testsuite/gcc.dg/vect/pr81082.c (revision 0) >> +++ gcc/testsuite/gcc.dg/vect/pr81082.c (working copy) >> @@ -0,0 +1,15 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target vect_int } */ >> + >> +int >> +f (int *x, int b1, int b2, int b3) >> +{ >> + int foo = 0; >> + for (int i1 = 0; i1 < b1; ++i1) >> + for (int i2 = 0; i2 < b2; ++i2) >> + for (int i3 = 0; i3 < b3; ++i3) >> + foo += x[i1 * b2 * b3 + i2 * b3 + (i3 - 1)]; >> + return foo; >> +} >> + >> +/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } */ >> Index: gcc/testsuite/gcc.dg/tree-ssa/loop-15.c >> =================================================================== >> --- gcc/testsuite/gcc.dg/tree-ssa/loop-15.c (revision 257048) >> +++ gcc/testsuite/gcc.dg/tree-ssa/loop-15.c (working copy) >> @@ -19,7 +19,7 @@ int bla(void) >> } >> >> /* Since the loop is removed, there should be no addition. */ >> -/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" } } */ >> +/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" { xfail *-*-* } } } */ >> /* { dg-final { scan-tree-dump-times " \\* " 1 "optimized" } } */ >> >> /* The if from the loop header copying remains in the code. */
On 26 January 2018 at 16:13, Richard Biener <richard.guenther@gmail.com> wrote: > On Fri, Jan 26, 2018 at 3:57 PM, Christophe Lyon > <christophe.lyon@linaro.org> wrote: >> On 26 January 2018 at 11:25, Richard Biener <rguenther@suse.de> wrote: >>> On Thu, 25 Jan 2018, Richard Biener wrote: >>> >>>> On Thu, 25 Jan 2018, Marc Glisse wrote: >>>> >>>> > On Thu, 25 Jan 2018, Richard Biener wrote: >>>> > >>>> > > --- gcc/match.pd (revision 257047) >>>> > > +++ gcc/match.pd (working copy) >>>> > > @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) >>>> > > (minus (convert (view_convert:stype @1)) >>>> > > (convert (view_convert:stype @2))))))) >>>> > > >>>> > > +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). >>>> > > + Modeled after fold_plusminus_mult_expr. */ >>>> > > +(if (!TYPE_SATURATING (type) >>>> > > + && (!FLOAT_TYPE_P (type) || flag_associative_math)) >>>> > > + (for plusminus (plus minus) >>>> > > + (simplify >>>> > > + (plusminus (mult:s @0 @1) (mult:cs @0 @2)) >>>> > >>>> > No :c on the first mult, so we don't actually handle A*C+B*C? >>>> >>>> Hmm, I somehow convinced myself that it's only necessary on one >>>> of the mults... but you are of course right. Will re-test with >>>> that fixed. >>> >>> This is what I have applied. Note I had to XFAIL a minor part of >>> gcc.dg/tree-ssa/loop-15.c, namely simplifying the final value >>> replacement down to n * n. While the patch should enable this >>> the place where the transform could trigger with enough information >>> about n is VRP but that doesn't end up folding all stmts - and >>> that's something I'm not willing to change right now. >>> >>> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied on trunk. >>> >> >> Hi, >> >> >> I think this patch caused regressions on aarch64: >> FAIL: gcc.dg/wmul-1.c scan-tree-dump-not widening_mul "WIDEN_MULT_PLUS_EXPR" > > Late forwprop does > > @@ -75,7 +21,7 @@ > _1 = (long unsigned int) Idx_6(D); > _2 = _1 * 40; > _12 = _1 * 4; > - _17 = _2 + _12; > + _17 = _1 * 44; > _13 = Arr_7(D) + _17; > MEM[(int[10] *)_13] = 1; > _4 = _2 + 400; > _18 = _4 + _12; > _16 = Arr_7(D) + _18; > MEM[(int[10] *)_16] = 2; > > > which I'm not sure ends up profitable at the end. It makes _12 > dead so the total number of multiplications stays the same. > For this we then apply the WIDEN_MULT_PLUS_EXPR two > times given _2 is no longer used multiple times. > > The reason the above transform happens is the match.pd :s > behavior which "ignores" :s in case the resulting expression > is "simple". > > Can you open a bugreport? > Sure, this is: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84067 > Richard. > >> Thanks, >> >> Christophe >> >>> Richard. >>> >>> 2018-01-26 Richard Biener <rguenther@suse.de> >>> >>> PR tree-optimization/81082 >>> * fold-const.c (fold_plusminus_mult_expr): Do not perform the >>> association if it requires casting to unsigned. >>> * match.pd ((A * C) +- (B * C) -> (A+-B)): New patterns derived >>> from fold_plusminus_mult_expr to catch important cases late when >>> range info is available. >>> >>> * gcc.dg/vect/pr81082.c: New testcase. >>> * gcc.dg/tree-ssa/loop-15.c: XFAIL the (int)((unsigned)n + -1U) * n + n >>> simplification to n * n. >>> >>> Index: gcc/fold-const.c >>> =================================================================== >>> --- gcc/fold-const.c (revision 257047) >>> +++ gcc/fold-const.c (working copy) >>> @@ -7097,7 +7097,7 @@ fold_plusminus_mult_expr (location_t loc >>> >>> /* Same may be zero and thus the operation 'code' may overflow. Likewise >>> same may be minus one and thus the multiplication may overflow. Perform >>> - the operations in an unsigned type. */ >>> + the sum operation in an unsigned type. */ >>> tree utype = unsigned_type_for (type); >>> tree tem = fold_build2_loc (loc, code, utype, >>> fold_convert_loc (loc, utype, alt0), >>> @@ -7110,9 +7110,9 @@ fold_plusminus_mult_expr (location_t loc >>> return fold_build2_loc (loc, MULT_EXPR, type, >>> fold_convert (type, tem), same); >>> >>> - return fold_convert_loc (loc, type, >>> - fold_build2_loc (loc, MULT_EXPR, utype, tem, >>> - fold_convert_loc (loc, utype, same))); >>> + /* Do not resort to unsigned multiplication because >>> + we lose the no-overflow property of the expression. */ >>> + return NULL_TREE; >>> } >>> >>> /* Subroutine of native_encode_expr. Encode the INTEGER_CST >>> Index: gcc/match.pd >>> =================================================================== >>> --- gcc/match.pd (revision 257047) >>> +++ gcc/match.pd (working copy) >>> @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) >>> (minus (convert (view_convert:stype @1)) >>> (convert (view_convert:stype @2))))))) >>> >>> +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). >>> + Modeled after fold_plusminus_mult_expr. */ >>> +(if (!TYPE_SATURATING (type) >>> + && (!FLOAT_TYPE_P (type) || flag_associative_math)) >>> + (for plusminus (plus minus) >>> + (simplify >>> + (plusminus (mult:cs @0 @1) (mult:cs @0 @2)) >>> + (if (!ANY_INTEGRAL_TYPE_P (type) >>> + || TYPE_OVERFLOW_WRAPS (type) >>> + || (INTEGRAL_TYPE_P (type) >>> + && tree_expr_nonzero_p (@0) >>> + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) >>> + (mult (plusminus @1 @2) @0))) >>> + /* We cannot generate constant 1 for fract. */ >>> + (if (!ALL_FRACT_MODE_P (TYPE_MODE (type))) >>> + (simplify >>> + (plusminus @0 (mult:cs @0 @2)) >>> + (if (!ANY_INTEGRAL_TYPE_P (type) >>> + || TYPE_OVERFLOW_WRAPS (type) >>> + || (INTEGRAL_TYPE_P (type) >>> + && tree_expr_nonzero_p (@0) >>> + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) >>> + (mult (plusminus { build_one_cst (type); } @2) @0))) >>> + (simplify >>> + (plusminus (mult:cs @0 @2) @0) >>> + (if (!ANY_INTEGRAL_TYPE_P (type) >>> + || TYPE_OVERFLOW_WRAPS (type) >>> + || (INTEGRAL_TYPE_P (type) >>> + && tree_expr_nonzero_p (@0) >>> + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) >>> + (mult (plusminus @2 { build_one_cst (type); }) @0)))))) >>> >>> /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */ >>> >>> Index: gcc/testsuite/gcc.dg/vect/pr81082.c >>> =================================================================== >>> --- gcc/testsuite/gcc.dg/vect/pr81082.c (revision 0) >>> +++ gcc/testsuite/gcc.dg/vect/pr81082.c (working copy) >>> @@ -0,0 +1,15 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target vect_int } */ >>> + >>> +int >>> +f (int *x, int b1, int b2, int b3) >>> +{ >>> + int foo = 0; >>> + for (int i1 = 0; i1 < b1; ++i1) >>> + for (int i2 = 0; i2 < b2; ++i2) >>> + for (int i3 = 0; i3 < b3; ++i3) >>> + foo += x[i1 * b2 * b3 + i2 * b3 + (i3 - 1)]; >>> + return foo; >>> +} >>> + >>> +/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } */ >>> Index: gcc/testsuite/gcc.dg/tree-ssa/loop-15.c >>> =================================================================== >>> --- gcc/testsuite/gcc.dg/tree-ssa/loop-15.c (revision 257048) >>> +++ gcc/testsuite/gcc.dg/tree-ssa/loop-15.c (working copy) >>> @@ -19,7 +19,7 @@ int bla(void) >>> } >>> >>> /* Since the loop is removed, there should be no addition. */ >>> -/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" } } */ >>> +/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" { xfail *-*-* } } } */ >>> /* { dg-final { scan-tree-dump-times " \\* " 1 "optimized" } } */ >>> >>> /* The if from the loop header copying remains in the code. */
Index: gcc/fold-const.c =================================================================== --- gcc/fold-const.c (revision 257047) +++ gcc/fold-const.c (working copy) @@ -7097,7 +7097,7 @@ fold_plusminus_mult_expr (location_t loc /* Same may be zero and thus the operation 'code' may overflow. Likewise same may be minus one and thus the multiplication may overflow. Perform - the operations in an unsigned type. */ + the sum operation in an unsigned type. */ tree utype = unsigned_type_for (type); tree tem = fold_build2_loc (loc, code, utype, fold_convert_loc (loc, utype, alt0), @@ -7110,9 +7110,9 @@ fold_plusminus_mult_expr (location_t loc return fold_build2_loc (loc, MULT_EXPR, type, fold_convert (type, tem), same); - return fold_convert_loc (loc, type, - fold_build2_loc (loc, MULT_EXPR, utype, tem, - fold_convert_loc (loc, utype, same))); + /* Do not resort to unsigned multiplication because + we lose the no-overflow property of the expression. */ + return NULL_TREE; } /* Subroutine of native_encode_expr. Encode the INTEGER_CST Index: gcc/match.pd =================================================================== --- gcc/match.pd (revision 257047) +++ gcc/match.pd (working copy) @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (minus (convert (view_convert:stype @1)) (convert (view_convert:stype @2))))))) +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1). + Modeled after fold_plusminus_mult_expr. */ +(if (!TYPE_SATURATING (type) + && (!FLOAT_TYPE_P (type) || flag_associative_math)) + (for plusminus (plus minus) + (simplify + (plusminus (mult:s @0 @1) (mult:cs @0 @2)) + (if (!ANY_INTEGRAL_TYPE_P (type) + || TYPE_OVERFLOW_WRAPS (type) + || (INTEGRAL_TYPE_P (type) + && tree_expr_nonzero_p (@0) + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) + (mult (plusminus @1 @2) @0))) + /* We cannot generate constant 1 for fract. */ + (if (!ALL_FRACT_MODE_P (TYPE_MODE (type))) + (simplify + (plusminus @0 (mult:cs @0 @2)) + (if (!ANY_INTEGRAL_TYPE_P (type) + || TYPE_OVERFLOW_WRAPS (type) + || (INTEGRAL_TYPE_P (type) + && tree_expr_nonzero_p (@0) + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) + (mult (plusminus { build_one_cst (type); } @2) @0))) + (simplify + (plusminus (mult:cs @0 @2) @0) + (if (!ANY_INTEGRAL_TYPE_P (type) + || TYPE_OVERFLOW_WRAPS (type) + || (INTEGRAL_TYPE_P (type) + && tree_expr_nonzero_p (@0) + && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) + (mult (plusminus @2 { build_one_cst (type); }) @0)))))) /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */ Index: gcc/testsuite/gcc.dg/vect/pr81082.c =================================================================== --- gcc/testsuite/gcc.dg/vect/pr81082.c (nonexistent) +++ gcc/testsuite/gcc.dg/vect/pr81082.c (working copy) @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ + +int +f (int *x, int b1, int b2, int b3) +{ + int foo = 0; + for (int i1 = 0; i1 < b1; ++i1) + for (int i2 = 0; i2 < b2; ++i2) + for (int i3 = 0; i3 < b3; ++i3) + foo += x[i1 * b2 * b3 + i2 * b3 + (i3 - 1)]; + return foo; +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } */