Message ID | alpine.LSU.2.20.1712061417230.12252@zhemvz.fhfr.qr |
---|---|
State | New |
Headers | show |
Series | Fix vectorizer part of PR81303 | expand |
On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: > > The following fixes a vectorization issue that appears when trying > to vectorize the bwaves mat_times_vec kernel after interchange was > performed by the interchange pass. That interchange inserts the > following code for the former reduction created by LIM store-motion: I do observe more cases are vectorized by this patch on AArch64. Still want to find a way not generating the cond_expr, but for the moment I will have another patch make interchange even more conservative for small cases. In which the new cmp/select instructions do cost a lot against the small loop body. Thanks, bin > > <bb 13> [local count: 161061274]: > # m_58 = PHI <1(10), m_84(20)> > ... > <bb 14> [local count: 912680551]: > # l_35 = PHI <1(13), l_57(21)> > ... > y__I_lsm.113_140 = *y_139(D)[_31]; > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; > ... > *y_139(D)[_31] = _101; > > > so we have a COND_EXPR with a test on an integer IV m_58 with > double values. Note that the m_58 != 1 condition is invariant > in the l loop. > > Currently we vectorize this condition using V8SImode vectors > causing a vectorization factor of 8 and thus forcing the scalar > path for the bwaves case (the loops have an iteration count of 5). > > The following patch makes the vectorizer handle invariant conditions > in the first place and second handle widening of operands of invariant > conditions transparently (the promotion will happen on the invariant > scalars). This makes it possible to use a vectorization factor of 4, > reducing the bwaves runtime from 208s before interchange > (via 190s after interchange) to 172s after interchange and vectorization > with AVX256 (on a Haswell machine). > > For the vectorizable_condition part to work I need to avoid > pulling apart the condition from the COND_EXPR during pattern > detection. > > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. > > Richard. > > 2017-12-06 Richard Biener <rguenther@suse.de> > > PR tree-optimization/81303 > * tree-vect-stmts.c (vect_is_simple_cond): For invariant > conditions try to create a comparison vector type matching > the data vector type. > (vectorizable_condition): Adjust. > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): > Leave invariant conditions alone in case we can vectorize those. > > * gcc.target/i386/vectorize9.c: New testcase. > * gcc.target/i386/vectorize10.c: New testcase. > > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c (revision 255438) > +++ gcc/tree-vect-stmts.c (working copy) > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ > > static bool > vect_is_simple_cond (tree cond, vec_info *vinfo, > - tree *comp_vectype, enum vect_def_type *dts) > + tree *comp_vectype, enum vect_def_type *dts, > + tree vectype) > { > tree lhs, rhs; > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info > return false; > > *comp_vectype = vectype1 ? vectype1 : vectype2; > + /* Invariant comparison. */ > + if (! *comp_vectype) > + { > + tree scalar_type = TREE_TYPE (lhs); > + /* If we can widen the comparison to match vectype do so. */ > + if (INTEGRAL_TYPE_P (scalar_type) > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), > + TYPE_SIZE (TREE_TYPE (vectype)))) > + scalar_type = build_nonstandard_integer_type > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), > + TYPE_UNSIGNED (scalar_type)); > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); > + } > + > return true; > } > > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi > else_clause = gimple_assign_rhs3 (stmt); > > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, > - &comp_vectype, &dts[0]) > + &comp_vectype, &dts[0], vectype) > || !comp_vectype) > return false; > > Index: gcc/tree-vect-patterns.c > =================================================================== > --- gcc/tree-vect-patterns.c (revision 255438) > +++ gcc/tree-vect-patterns.c (working copy) > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) > return NULL; > > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR > + in place, we can handle it in vectorizable_condition. This avoids > + unnecessary promotion stmts and increased vectorization factor. */ > + if (COMPARISON_CLASS_P (rhs1) > + && INTEGRAL_TYPE_P (rhs1_type) > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) > + { > + gimple *dummy; > + enum vect_def_type dt; > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, > + &dummy, &dt) > + && dt == vect_external_def > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, > + &dummy, &dt) > + && (dt == vect_external_def > + || dt == vect_constant_def)) > + { > + tree wide_scalar_type = build_nonstandard_integer_type > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), > + TYPE_UNSIGNED (rhs1_type)); > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) > + return NULL; > + } > + } > + > /* If rhs1 is a comparison we need to move it into a > separate statement. */ > if (TREE_CODE (rhs1) != SSA_NAME) > Index: gcc/testsuite/gcc.target/i386/vectorize9.c > =================================================================== > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ > + > +double x[1024][1024], red[1024]; > +void foo (void) > +{ > + for (int i = 0; i < 1024; ++i) > + for (int j = 0; j < 1024; ++j) > + { > + double v = i == 0 ? 0.0 : red[j]; > + v = v + x[i][j]; > + red[j] = v; > + } > +} > + > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ > Index: gcc/testsuite/gcc.target/i386/vectorize10.c > =================================================================== > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ > + > +double x[1024][1024], red[1024]; > +void foo (void) > +{ > + for (int i = 0; i < 1024; ++i) > + for (int j = 0; j < 1024; ++j) > + { > + double v = i == 0 ? 0.0 : red[j]; > + v = v + x[i][j]; > + red[j] = v; > + } > +} > + > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
On Thu, 7 Dec 2017, Bin.Cheng wrote: > On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: > > > > The following fixes a vectorization issue that appears when trying > > to vectorize the bwaves mat_times_vec kernel after interchange was > > performed by the interchange pass. That interchange inserts the > > following code for the former reduction created by LIM store-motion: > I do observe more cases are vectorized by this patch on AArch64. > Still want to find a way not generating the cond_expr, but for the moment > I will have another patch make interchange even more conservative for > small cases. In which the new cmp/select instructions do cost a lot against > the small loop body. Yeah. I thought about what it takes to avoid the conditional - basically we'd need to turn the init value to a (non-nested) loop that we'd need to insert on the preheader of the outer loop. Richard. > Thanks, > bin > > > > <bb 13> [local count: 161061274]: > > # m_58 = PHI <1(10), m_84(20)> > > ... > > <bb 14> [local count: 912680551]: > > # l_35 = PHI <1(13), l_57(21)> > > ... > > y__I_lsm.113_140 = *y_139(D)[_31]; > > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; > > ... > > *y_139(D)[_31] = _101; > > > > > > so we have a COND_EXPR with a test on an integer IV m_58 with > > double values. Note that the m_58 != 1 condition is invariant > > in the l loop. > > > > Currently we vectorize this condition using V8SImode vectors > > causing a vectorization factor of 8 and thus forcing the scalar > > path for the bwaves case (the loops have an iteration count of 5). > > > > The following patch makes the vectorizer handle invariant conditions > > in the first place and second handle widening of operands of invariant > > conditions transparently (the promotion will happen on the invariant > > scalars). This makes it possible to use a vectorization factor of 4, > > reducing the bwaves runtime from 208s before interchange > > (via 190s after interchange) to 172s after interchange and vectorization > > with AVX256 (on a Haswell machine). > > > > For the vectorizable_condition part to work I need to avoid > > pulling apart the condition from the COND_EXPR during pattern > > detection. > > > > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. > > > > Richard. > > > > 2017-12-06 Richard Biener <rguenther@suse.de> > > > > PR tree-optimization/81303 > > * tree-vect-stmts.c (vect_is_simple_cond): For invariant > > conditions try to create a comparison vector type matching > > the data vector type. > > (vectorizable_condition): Adjust. > > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): > > Leave invariant conditions alone in case we can vectorize those. > > > > * gcc.target/i386/vectorize9.c: New testcase. > > * gcc.target/i386/vectorize10.c: New testcase. > > > > Index: gcc/tree-vect-stmts.c > > =================================================================== > > --- gcc/tree-vect-stmts.c (revision 255438) > > +++ gcc/tree-vect-stmts.c (working copy) > > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ > > > > static bool > > vect_is_simple_cond (tree cond, vec_info *vinfo, > > - tree *comp_vectype, enum vect_def_type *dts) > > + tree *comp_vectype, enum vect_def_type *dts, > > + tree vectype) > > { > > tree lhs, rhs; > > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; > > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info > > return false; > > > > *comp_vectype = vectype1 ? vectype1 : vectype2; > > + /* Invariant comparison. */ > > + if (! *comp_vectype) > > + { > > + tree scalar_type = TREE_TYPE (lhs); > > + /* If we can widen the comparison to match vectype do so. */ > > + if (INTEGRAL_TYPE_P (scalar_type) > > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), > > + TYPE_SIZE (TREE_TYPE (vectype)))) > > + scalar_type = build_nonstandard_integer_type > > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), > > + TYPE_UNSIGNED (scalar_type)); > > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); > > + } > > + > > return true; > > } > > > > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi > > else_clause = gimple_assign_rhs3 (stmt); > > > > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, > > - &comp_vectype, &dts[0]) > > + &comp_vectype, &dts[0], vectype) > > || !comp_vectype) > > return false; > > > > Index: gcc/tree-vect-patterns.c > > =================================================================== > > --- gcc/tree-vect-patterns.c (revision 255438) > > +++ gcc/tree-vect-patterns.c (working copy) > > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< > > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) > > return NULL; > > > > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR > > + in place, we can handle it in vectorizable_condition. This avoids > > + unnecessary promotion stmts and increased vectorization factor. */ > > + if (COMPARISON_CLASS_P (rhs1) > > + && INTEGRAL_TYPE_P (rhs1_type) > > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) > > + { > > + gimple *dummy; > > + enum vect_def_type dt; > > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, > > + &dummy, &dt) > > + && dt == vect_external_def > > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, > > + &dummy, &dt) > > + && (dt == vect_external_def > > + || dt == vect_constant_def)) > > + { > > + tree wide_scalar_type = build_nonstandard_integer_type > > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), > > + TYPE_UNSIGNED (rhs1_type)); > > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); > > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) > > + return NULL; > > + } > > + } > > + > > /* If rhs1 is a comparison we need to move it into a > > separate statement. */ > > if (TREE_CODE (rhs1) != SSA_NAME) > > Index: gcc/testsuite/gcc.target/i386/vectorize9.c > > =================================================================== > > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) > > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) > > @@ -0,0 +1,16 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ > > + > > +double x[1024][1024], red[1024]; > > +void foo (void) > > +{ > > + for (int i = 0; i < 1024; ++i) > > + for (int j = 0; j < 1024; ++j) > > + { > > + double v = i == 0 ? 0.0 : red[j]; > > + v = v + x[i][j]; > > + red[j] = v; > > + } > > +} > > + > > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ > > Index: gcc/testsuite/gcc.target/i386/vectorize10.c > > =================================================================== > > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) > > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) > > @@ -0,0 +1,16 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ > > + > > +double x[1024][1024], red[1024]; > > +void foo (void) > > +{ > > + for (int i = 0; i < 1024; ++i) > > + for (int j = 0; j < 1024; ++j) > > + { > > + double v = i == 0 ? 0.0 : red[j]; > > + v = v + x[i][j]; > > + red[j] = v; > > + } > > +} > > + > > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ > >
On 8 December 2017 at 09:07, Richard Biener <rguenther@suse.de> wrote: > On Thu, 7 Dec 2017, Bin.Cheng wrote: > >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: >> > >> > The following fixes a vectorization issue that appears when trying >> > to vectorize the bwaves mat_times_vec kernel after interchange was >> > performed by the interchange pass. That interchange inserts the >> > following code for the former reduction created by LIM store-motion: >> I do observe more cases are vectorized by this patch on AArch64. >> Still want to find a way not generating the cond_expr, but for the moment >> I will have another patch make interchange even more conservative for >> small cases. In which the new cmp/select instructions do cost a lot against >> the small loop body. > > Yeah. I thought about what it takes to avoid the conditional - basically > we'd need to turn the init value to a (non-nested) loop that we'd need > to insert on the preheader of the outer loop. > Hi, I noticed a regression on aarch64 after Bin's commit r255472: gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+, \\[x[0-9]+\\], [0-9]+ gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+, \\[x[0-9]+, [0-9]+\\]! gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\\[0\\] Is this patch supposed to fix it? Thanks, Christophe > Richard. > >> Thanks, >> bin >> > >> > <bb 13> [local count: 161061274]: >> > # m_58 = PHI <1(10), m_84(20)> >> > ... >> > <bb 14> [local count: 912680551]: >> > # l_35 = PHI <1(13), l_57(21)> >> > ... >> > y__I_lsm.113_140 = *y_139(D)[_31]; >> > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; >> > ... >> > *y_139(D)[_31] = _101; >> > >> > >> > so we have a COND_EXPR with a test on an integer IV m_58 with >> > double values. Note that the m_58 != 1 condition is invariant >> > in the l loop. >> > >> > Currently we vectorize this condition using V8SImode vectors >> > causing a vectorization factor of 8 and thus forcing the scalar >> > path for the bwaves case (the loops have an iteration count of 5). >> > >> > The following patch makes the vectorizer handle invariant conditions >> > in the first place and second handle widening of operands of invariant >> > conditions transparently (the promotion will happen on the invariant >> > scalars). This makes it possible to use a vectorization factor of 4, >> > reducing the bwaves runtime from 208s before interchange >> > (via 190s after interchange) to 172s after interchange and vectorization >> > with AVX256 (on a Haswell machine). >> > >> > For the vectorizable_condition part to work I need to avoid >> > pulling apart the condition from the COND_EXPR during pattern >> > detection. >> > >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. >> > >> > Richard. >> > >> > 2017-12-06 Richard Biener <rguenther@suse.de> >> > >> > PR tree-optimization/81303 >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant >> > conditions try to create a comparison vector type matching >> > the data vector type. >> > (vectorizable_condition): Adjust. >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): >> > Leave invariant conditions alone in case we can vectorize those. >> > >> > * gcc.target/i386/vectorize9.c: New testcase. >> > * gcc.target/i386/vectorize10.c: New testcase. >> > >> > Index: gcc/tree-vect-stmts.c >> > =================================================================== >> > --- gcc/tree-vect-stmts.c (revision 255438) >> > +++ gcc/tree-vect-stmts.c (working copy) >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ >> > >> > static bool >> > vect_is_simple_cond (tree cond, vec_info *vinfo, >> > - tree *comp_vectype, enum vect_def_type *dts) >> > + tree *comp_vectype, enum vect_def_type *dts, >> > + tree vectype) >> > { >> > tree lhs, rhs; >> > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; >> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info >> > return false; >> > >> > *comp_vectype = vectype1 ? vectype1 : vectype2; >> > + /* Invariant comparison. */ >> > + if (! *comp_vectype) >> > + { >> > + tree scalar_type = TREE_TYPE (lhs); >> > + /* If we can widen the comparison to match vectype do so. */ >> > + if (INTEGRAL_TYPE_P (scalar_type) >> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), >> > + TYPE_SIZE (TREE_TYPE (vectype)))) >> > + scalar_type = build_nonstandard_integer_type >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), >> > + TYPE_UNSIGNED (scalar_type)); >> > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); >> > + } >> > + >> > return true; >> > } >> > >> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi >> > else_clause = gimple_assign_rhs3 (stmt); >> > >> > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, >> > - &comp_vectype, &dts[0]) >> > + &comp_vectype, &dts[0], vectype) >> > || !comp_vectype) >> > return false; >> > >> > Index: gcc/tree-vect-patterns.c >> > =================================================================== >> > --- gcc/tree-vect-patterns.c (revision 255438) >> > +++ gcc/tree-vect-patterns.c (working copy) >> > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< >> > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) >> > return NULL; >> > >> > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR >> > + in place, we can handle it in vectorizable_condition. This avoids >> > + unnecessary promotion stmts and increased vectorization factor. */ >> > + if (COMPARISON_CLASS_P (rhs1) >> > + && INTEGRAL_TYPE_P (rhs1_type) >> > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) >> > + { >> > + gimple *dummy; >> > + enum vect_def_type dt; >> > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, >> > + &dummy, &dt) >> > + && dt == vect_external_def >> > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, >> > + &dummy, &dt) >> > + && (dt == vect_external_def >> > + || dt == vect_constant_def)) >> > + { >> > + tree wide_scalar_type = build_nonstandard_integer_type >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), >> > + TYPE_UNSIGNED (rhs1_type)); >> > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); >> > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) >> > + return NULL; >> > + } >> > + } >> > + >> > /* If rhs1 is a comparison we need to move it into a >> > separate statement. */ >> > if (TREE_CODE (rhs1) != SSA_NAME) >> > Index: gcc/testsuite/gcc.target/i386/vectorize9.c >> > =================================================================== >> > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) >> > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) >> > @@ -0,0 +1,16 @@ >> > +/* { dg-do compile } */ >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ >> > + >> > +double x[1024][1024], red[1024]; >> > +void foo (void) >> > +{ >> > + for (int i = 0; i < 1024; ++i) >> > + for (int j = 0; j < 1024; ++j) >> > + { >> > + double v = i == 0 ? 0.0 : red[j]; >> > + v = v + x[i][j]; >> > + red[j] = v; >> > + } >> > +} >> > + >> > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ >> > Index: gcc/testsuite/gcc.target/i386/vectorize10.c >> > =================================================================== >> > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) >> > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) >> > @@ -0,0 +1,16 @@ >> > +/* { dg-do compile } */ >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ >> > + >> > +double x[1024][1024], red[1024]; >> > +void foo (void) >> > +{ >> > + for (int i = 0; i < 1024; ++i) >> > + for (int j = 0; j < 1024; ++j) >> > + { >> > + double v = i == 0 ? 0.0 : red[j]; >> > + v = v + x[i][j]; >> > + red[j] = v; >> > + } >> > +} >> > + >> > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ >> >> > > -- > Richard Biener <rguenther@suse.de> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
On Fri, 8 Dec 2017, Christophe Lyon wrote: > On 8 December 2017 at 09:07, Richard Biener <rguenther@suse.de> wrote: > > On Thu, 7 Dec 2017, Bin.Cheng wrote: > > > >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: > >> > > >> > The following fixes a vectorization issue that appears when trying > >> > to vectorize the bwaves mat_times_vec kernel after interchange was > >> > performed by the interchange pass. That interchange inserts the > >> > following code for the former reduction created by LIM store-motion: > >> I do observe more cases are vectorized by this patch on AArch64. > >> Still want to find a way not generating the cond_expr, but for the moment > >> I will have another patch make interchange even more conservative for > >> small cases. In which the new cmp/select instructions do cost a lot against > >> the small loop body. > > > > Yeah. I thought about what it takes to avoid the conditional - basically > > we'd need to turn the init value to a (non-nested) loop that we'd need > > to insert on the preheader of the outer loop. > > > > Hi, > > I noticed a regression on aarch64 after Bin's commit r255472: > gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+, > \\[x[0-9]+\\], [0-9]+ > gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+, > \\[x[0-9]+, [0-9]+\\]! > gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s, > v[0-9]+.4s, v[0-9]+.s\\[0\\] > > Is this patch supposed to fix it? No, from what I can see the patch shouldn't affect it. But it's not clear what the testcase tests for - it just scans assembler. Clearly we want to interchange the loop here so the scan assembler needs to be adjusted and one has to revisit PR62178 to check whether the result is still ok (or simply add -fno-loop-interchange to it). Richard. > Thanks, > > Christophe > > > Richard. > > > >> Thanks, > >> bin > >> > > >> > <bb 13> [local count: 161061274]: > >> > # m_58 = PHI <1(10), m_84(20)> > >> > ... > >> > <bb 14> [local count: 912680551]: > >> > # l_35 = PHI <1(13), l_57(21)> > >> > ... > >> > y__I_lsm.113_140 = *y_139(D)[_31]; > >> > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; > >> > ... > >> > *y_139(D)[_31] = _101; > >> > > >> > > >> > so we have a COND_EXPR with a test on an integer IV m_58 with > >> > double values. Note that the m_58 != 1 condition is invariant > >> > in the l loop. > >> > > >> > Currently we vectorize this condition using V8SImode vectors > >> > causing a vectorization factor of 8 and thus forcing the scalar > >> > path for the bwaves case (the loops have an iteration count of 5). > >> > > >> > The following patch makes the vectorizer handle invariant conditions > >> > in the first place and second handle widening of operands of invariant > >> > conditions transparently (the promotion will happen on the invariant > >> > scalars). This makes it possible to use a vectorization factor of 4, > >> > reducing the bwaves runtime from 208s before interchange > >> > (via 190s after interchange) to 172s after interchange and vectorization > >> > with AVX256 (on a Haswell machine). > >> > > >> > For the vectorizable_condition part to work I need to avoid > >> > pulling apart the condition from the COND_EXPR during pattern > >> > detection. > >> > > >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. > >> > > >> > Richard. > >> > > >> > 2017-12-06 Richard Biener <rguenther@suse.de> > >> > > >> > PR tree-optimization/81303 > >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant > >> > conditions try to create a comparison vector type matching > >> > the data vector type. > >> > (vectorizable_condition): Adjust. > >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): > >> > Leave invariant conditions alone in case we can vectorize those. > >> > > >> > * gcc.target/i386/vectorize9.c: New testcase. > >> > * gcc.target/i386/vectorize10.c: New testcase. > >> > > >> > Index: gcc/tree-vect-stmts.c > >> > =================================================================== > >> > --- gcc/tree-vect-stmts.c (revision 255438) > >> > +++ gcc/tree-vect-stmts.c (working copy) > >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ > >> > > >> > static bool > >> > vect_is_simple_cond (tree cond, vec_info *vinfo, > >> > - tree *comp_vectype, enum vect_def_type *dts) > >> > + tree *comp_vectype, enum vect_def_type *dts, > >> > + tree vectype) > >> > { > >> > tree lhs, rhs; > >> > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; > >> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info > >> > return false; > >> > > >> > *comp_vectype = vectype1 ? vectype1 : vectype2; > >> > + /* Invariant comparison. */ > >> > + if (! *comp_vectype) > >> > + { > >> > + tree scalar_type = TREE_TYPE (lhs); > >> > + /* If we can widen the comparison to match vectype do so. */ > >> > + if (INTEGRAL_TYPE_P (scalar_type) > >> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), > >> > + TYPE_SIZE (TREE_TYPE (vectype)))) > >> > + scalar_type = build_nonstandard_integer_type > >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), > >> > + TYPE_UNSIGNED (scalar_type)); > >> > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); > >> > + } > >> > + > >> > return true; > >> > } > >> > > >> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi > >> > else_clause = gimple_assign_rhs3 (stmt); > >> > > >> > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, > >> > - &comp_vectype, &dts[0]) > >> > + &comp_vectype, &dts[0], vectype) > >> > || !comp_vectype) > >> > return false; > >> > > >> > Index: gcc/tree-vect-patterns.c > >> > =================================================================== > >> > --- gcc/tree-vect-patterns.c (revision 255438) > >> > +++ gcc/tree-vect-patterns.c (working copy) > >> > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< > >> > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) > >> > return NULL; > >> > > >> > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR > >> > + in place, we can handle it in vectorizable_condition. This avoids > >> > + unnecessary promotion stmts and increased vectorization factor. */ > >> > + if (COMPARISON_CLASS_P (rhs1) > >> > + && INTEGRAL_TYPE_P (rhs1_type) > >> > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) > >> > + { > >> > + gimple *dummy; > >> > + enum vect_def_type dt; > >> > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, > >> > + &dummy, &dt) > >> > + && dt == vect_external_def > >> > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, > >> > + &dummy, &dt) > >> > + && (dt == vect_external_def > >> > + || dt == vect_constant_def)) > >> > + { > >> > + tree wide_scalar_type = build_nonstandard_integer_type > >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), > >> > + TYPE_UNSIGNED (rhs1_type)); > >> > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); > >> > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) > >> > + return NULL; > >> > + } > >> > + } > >> > + > >> > /* If rhs1 is a comparison we need to move it into a > >> > separate statement. */ > >> > if (TREE_CODE (rhs1) != SSA_NAME) > >> > Index: gcc/testsuite/gcc.target/i386/vectorize9.c > >> > =================================================================== > >> > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) > >> > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) > >> > @@ -0,0 +1,16 @@ > >> > +/* { dg-do compile } */ > >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ > >> > + > >> > +double x[1024][1024], red[1024]; > >> > +void foo (void) > >> > +{ > >> > + for (int i = 0; i < 1024; ++i) > >> > + for (int j = 0; j < 1024; ++j) > >> > + { > >> > + double v = i == 0 ? 0.0 : red[j]; > >> > + v = v + x[i][j]; > >> > + red[j] = v; > >> > + } > >> > +} > >> > + > >> > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ > >> > Index: gcc/testsuite/gcc.target/i386/vectorize10.c > >> > =================================================================== > >> > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) > >> > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) > >> > @@ -0,0 +1,16 @@ > >> > +/* { dg-do compile } */ > >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ > >> > + > >> > +double x[1024][1024], red[1024]; > >> > +void foo (void) > >> > +{ > >> > + for (int i = 0; i < 1024; ++i) > >> > + for (int j = 0; j < 1024; ++j) > >> > + { > >> > + double v = i == 0 ? 0.0 : red[j]; > >> > + v = v + x[i][j]; > >> > + red[j] = v; > >> > + } > >> > +} > >> > + > >> > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ > >> > >> > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) > >
On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener <rguenther@suse.de> wrote: > On Fri, 8 Dec 2017, Christophe Lyon wrote: > >> On 8 December 2017 at 09:07, Richard Biener <rguenther@suse.de> wrote: >> > On Thu, 7 Dec 2017, Bin.Cheng wrote: >> > >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: >> >> > >> >> > The following fixes a vectorization issue that appears when trying >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was >> >> > performed by the interchange pass. That interchange inserts the >> >> > following code for the former reduction created by LIM store-motion: >> >> I do observe more cases are vectorized by this patch on AArch64. >> >> Still want to find a way not generating the cond_expr, but for the moment >> >> I will have another patch make interchange even more conservative for >> >> small cases. In which the new cmp/select instructions do cost a lot against >> >> the small loop body. >> > >> > Yeah. I thought about what it takes to avoid the conditional - basically >> > we'd need to turn the init value to a (non-nested) loop that we'd need >> > to insert on the preheader of the outer loop. >> > >> >> Hi, >> >> I noticed a regression on aarch64 after Bin's commit r255472: >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+, >> \\[x[0-9]+\\], [0-9]+ >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+, >> \\[x[0-9]+, [0-9]+\\]! >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s, >> v[0-9]+.4s, v[0-9]+.s\\[0\\] >> >> Is this patch supposed to fix it? > > No, from what I can see the patch shouldn't affect it. But it's not > clear what the testcase tests for - it just scans assembler. > Clearly we want to interchange the loop here so the scan assembler I am not very sure. Though interchanging gives better cache behavior, but the loop is relatively small here, the introduced cond_expr results in two more instructions, as well as one additional memory access from undoing reduction. Together with addressing mode chosen in ivopts, it leads to obvious regression. Ah, another issue is the cond_expr blocks vectorization without your patch here. This case is what I meant small loops in which more conservative interchange may be wanted. Thanks, bin > needs to be adjusted and one has to revisit PR62178 to check whether > the result is still ok (or simply add -fno-loop-interchange to it). > > Richard. > >> Thanks, >> >> Christophe >> >> > Richard. >> > >> >> Thanks, >> >> bin >> >> > >> >> > <bb 13> [local count: 161061274]: >> >> > # m_58 = PHI <1(10), m_84(20)> >> >> > ... >> >> > <bb 14> [local count: 912680551]: >> >> > # l_35 = PHI <1(13), l_57(21)> >> >> > ... >> >> > y__I_lsm.113_140 = *y_139(D)[_31]; >> >> > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; >> >> > ... >> >> > *y_139(D)[_31] = _101; >> >> > >> >> > >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with >> >> > double values. Note that the m_58 != 1 condition is invariant >> >> > in the l loop. >> >> > >> >> > Currently we vectorize this condition using V8SImode vectors >> >> > causing a vectorization factor of 8 and thus forcing the scalar >> >> > path for the bwaves case (the loops have an iteration count of 5). >> >> > >> >> > The following patch makes the vectorizer handle invariant conditions >> >> > in the first place and second handle widening of operands of invariant >> >> > conditions transparently (the promotion will happen on the invariant >> >> > scalars). This makes it possible to use a vectorization factor of 4, >> >> > reducing the bwaves runtime from 208s before interchange >> >> > (via 190s after interchange) to 172s after interchange and vectorization >> >> > with AVX256 (on a Haswell machine). >> >> > >> >> > For the vectorizable_condition part to work I need to avoid >> >> > pulling apart the condition from the COND_EXPR during pattern >> >> > detection. >> >> > >> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. >> >> > >> >> > Richard. >> >> > >> >> > 2017-12-06 Richard Biener <rguenther@suse.de> >> >> > >> >> > PR tree-optimization/81303 >> >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant >> >> > conditions try to create a comparison vector type matching >> >> > the data vector type. >> >> > (vectorizable_condition): Adjust. >> >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): >> >> > Leave invariant conditions alone in case we can vectorize those. >> >> > >> >> > * gcc.target/i386/vectorize9.c: New testcase. >> >> > * gcc.target/i386/vectorize10.c: New testcase. >> >> > >> >> > Index: gcc/tree-vect-stmts.c >> >> > =================================================================== >> >> > --- gcc/tree-vect-stmts.c (revision 255438) >> >> > +++ gcc/tree-vect-stmts.c (working copy) >> >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ >> >> > >> >> > static bool >> >> > vect_is_simple_cond (tree cond, vec_info *vinfo, >> >> > - tree *comp_vectype, enum vect_def_type *dts) >> >> > + tree *comp_vectype, enum vect_def_type *dts, >> >> > + tree vectype) >> >> > { >> >> > tree lhs, rhs; >> >> > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; >> >> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info >> >> > return false; >> >> > >> >> > *comp_vectype = vectype1 ? vectype1 : vectype2; >> >> > + /* Invariant comparison. */ >> >> > + if (! *comp_vectype) >> >> > + { >> >> > + tree scalar_type = TREE_TYPE (lhs); >> >> > + /* If we can widen the comparison to match vectype do so. */ >> >> > + if (INTEGRAL_TYPE_P (scalar_type) >> >> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), >> >> > + TYPE_SIZE (TREE_TYPE (vectype)))) >> >> > + scalar_type = build_nonstandard_integer_type >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), >> >> > + TYPE_UNSIGNED (scalar_type)); >> >> > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); >> >> > + } >> >> > + >> >> > return true; >> >> > } >> >> > >> >> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi >> >> > else_clause = gimple_assign_rhs3 (stmt); >> >> > >> >> > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, >> >> > - &comp_vectype, &dts[0]) >> >> > + &comp_vectype, &dts[0], vectype) >> >> > || !comp_vectype) >> >> > return false; >> >> > >> >> > Index: gcc/tree-vect-patterns.c >> >> > =================================================================== >> >> > --- gcc/tree-vect-patterns.c (revision 255438) >> >> > +++ gcc/tree-vect-patterns.c (working copy) >> >> > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< >> >> > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) >> >> > return NULL; >> >> > >> >> > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR >> >> > + in place, we can handle it in vectorizable_condition. This avoids >> >> > + unnecessary promotion stmts and increased vectorization factor. */ >> >> > + if (COMPARISON_CLASS_P (rhs1) >> >> > + && INTEGRAL_TYPE_P (rhs1_type) >> >> > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) >> >> > + { >> >> > + gimple *dummy; >> >> > + enum vect_def_type dt; >> >> > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, >> >> > + &dummy, &dt) >> >> > + && dt == vect_external_def >> >> > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, >> >> > + &dummy, &dt) >> >> > + && (dt == vect_external_def >> >> > + || dt == vect_constant_def)) >> >> > + { >> >> > + tree wide_scalar_type = build_nonstandard_integer_type >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), >> >> > + TYPE_UNSIGNED (rhs1_type)); >> >> > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); >> >> > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) >> >> > + return NULL; >> >> > + } >> >> > + } >> >> > + >> >> > /* If rhs1 is a comparison we need to move it into a >> >> > separate statement. */ >> >> > if (TREE_CODE (rhs1) != SSA_NAME) >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize9.c >> >> > =================================================================== >> >> > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) >> >> > @@ -0,0 +1,16 @@ >> >> > +/* { dg-do compile } */ >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ >> >> > + >> >> > +double x[1024][1024], red[1024]; >> >> > +void foo (void) >> >> > +{ >> >> > + for (int i = 0; i < 1024; ++i) >> >> > + for (int j = 0; j < 1024; ++j) >> >> > + { >> >> > + double v = i == 0 ? 0.0 : red[j]; >> >> > + v = v + x[i][j]; >> >> > + red[j] = v; >> >> > + } >> >> > +} >> >> > + >> >> > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize10.c >> >> > =================================================================== >> >> > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) >> >> > @@ -0,0 +1,16 @@ >> >> > +/* { dg-do compile } */ >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ >> >> > + >> >> > +double x[1024][1024], red[1024]; >> >> > +void foo (void) >> >> > +{ >> >> > + for (int i = 0; i < 1024; ++i) >> >> > + for (int j = 0; j < 1024; ++j) >> >> > + { >> >> > + double v = i == 0 ? 0.0 : red[j]; >> >> > + v = v + x[i][j]; >> >> > + red[j] = v; >> >> > + } >> >> > +} >> >> > + >> >> > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ >> >> >> >> >> > >> > -- >> > Richard Biener <rguenther@suse.de> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) >> >> > > -- > Richard Biener <rguenther@suse.de> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
On Fri, 8 Dec 2017, Bin.Cheng wrote: > On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener <rguenther@suse.de> wrote: > > On Fri, 8 Dec 2017, Christophe Lyon wrote: > > > >> On 8 December 2017 at 09:07, Richard Biener <rguenther@suse.de> wrote: > >> > On Thu, 7 Dec 2017, Bin.Cheng wrote: > >> > > >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: > >> >> > > >> >> > The following fixes a vectorization issue that appears when trying > >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was > >> >> > performed by the interchange pass. That interchange inserts the > >> >> > following code for the former reduction created by LIM store-motion: > >> >> I do observe more cases are vectorized by this patch on AArch64. > >> >> Still want to find a way not generating the cond_expr, but for the moment > >> >> I will have another patch make interchange even more conservative for > >> >> small cases. In which the new cmp/select instructions do cost a lot against > >> >> the small loop body. > >> > > >> > Yeah. I thought about what it takes to avoid the conditional - basically > >> > we'd need to turn the init value to a (non-nested) loop that we'd need > >> > to insert on the preheader of the outer loop. > >> > > >> > >> Hi, > >> > >> I noticed a regression on aarch64 after Bin's commit r255472: > >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+, > >> \\[x[0-9]+\\], [0-9]+ > >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+, > >> \\[x[0-9]+, [0-9]+\\]! > >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s, > >> v[0-9]+.4s, v[0-9]+.s\\[0\\] > >> > >> Is this patch supposed to fix it? > > > > No, from what I can see the patch shouldn't affect it. But it's not > > clear what the testcase tests for - it just scans assembler. > > Clearly we want to interchange the loop here so the scan assembler > I am not very sure. Though interchanging gives better cache behavior, > but the loop is relatively small here, the introduced cond_expr > results in two more instructions, as well as one additional memory > access from undoing reduction. Together with addressing mode chosen > in ivopts, it leads to obvious regression. > Ah, another issue is the cond_expr blocks vectorization without your > patch here. This case is what I meant small loops in which more > conservative interchange may be wanted. The loop has int data and int IVs so my patch shouldn't be necessary to vectorize the loop. Richard. > Thanks, > bin > > > needs to be adjusted and one has to revisit PR62178 to check whether > > the result is still ok (or simply add -fno-loop-interchange to it). > > > > Richard. > > > >> Thanks, > >> > >> Christophe > >> > >> > Richard. > >> > > >> >> Thanks, > >> >> bin > >> >> > > >> >> > <bb 13> [local count: 161061274]: > >> >> > # m_58 = PHI <1(10), m_84(20)> > >> >> > ... > >> >> > <bb 14> [local count: 912680551]: > >> >> > # l_35 = PHI <1(13), l_57(21)> > >> >> > ... > >> >> > y__I_lsm.113_140 = *y_139(D)[_31]; > >> >> > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; > >> >> > ... > >> >> > *y_139(D)[_31] = _101; > >> >> > > >> >> > > >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with > >> >> > double values. Note that the m_58 != 1 condition is invariant > >> >> > in the l loop. > >> >> > > >> >> > Currently we vectorize this condition using V8SImode vectors > >> >> > causing a vectorization factor of 8 and thus forcing the scalar > >> >> > path for the bwaves case (the loops have an iteration count of 5). > >> >> > > >> >> > The following patch makes the vectorizer handle invariant conditions > >> >> > in the first place and second handle widening of operands of invariant > >> >> > conditions transparently (the promotion will happen on the invariant > >> >> > scalars). This makes it possible to use a vectorization factor of 4, > >> >> > reducing the bwaves runtime from 208s before interchange > >> >> > (via 190s after interchange) to 172s after interchange and vectorization > >> >> > with AVX256 (on a Haswell machine). > >> >> > > >> >> > For the vectorizable_condition part to work I need to avoid > >> >> > pulling apart the condition from the COND_EXPR during pattern > >> >> > detection. > >> >> > > >> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. > >> >> > > >> >> > Richard. > >> >> > > >> >> > 2017-12-06 Richard Biener <rguenther@suse.de> > >> >> > > >> >> > PR tree-optimization/81303 > >> >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant > >> >> > conditions try to create a comparison vector type matching > >> >> > the data vector type. > >> >> > (vectorizable_condition): Adjust. > >> >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): > >> >> > Leave invariant conditions alone in case we can vectorize those. > >> >> > > >> >> > * gcc.target/i386/vectorize9.c: New testcase. > >> >> > * gcc.target/i386/vectorize10.c: New testcase. > >> >> > > >> >> > Index: gcc/tree-vect-stmts.c > >> >> > =================================================================== > >> >> > --- gcc/tree-vect-stmts.c (revision 255438) > >> >> > +++ gcc/tree-vect-stmts.c (working copy) > >> >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ > >> >> > > >> >> > static bool > >> >> > vect_is_simple_cond (tree cond, vec_info *vinfo, > >> >> > - tree *comp_vectype, enum vect_def_type *dts) > >> >> > + tree *comp_vectype, enum vect_def_type *dts, > >> >> > + tree vectype) > >> >> > { > >> >> > tree lhs, rhs; > >> >> > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; > >> >> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info > >> >> > return false; > >> >> > > >> >> > *comp_vectype = vectype1 ? vectype1 : vectype2; > >> >> > + /* Invariant comparison. */ > >> >> > + if (! *comp_vectype) > >> >> > + { > >> >> > + tree scalar_type = TREE_TYPE (lhs); > >> >> > + /* If we can widen the comparison to match vectype do so. */ > >> >> > + if (INTEGRAL_TYPE_P (scalar_type) > >> >> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), > >> >> > + TYPE_SIZE (TREE_TYPE (vectype)))) > >> >> > + scalar_type = build_nonstandard_integer_type > >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), > >> >> > + TYPE_UNSIGNED (scalar_type)); > >> >> > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); > >> >> > + } > >> >> > + > >> >> > return true; > >> >> > } > >> >> > > >> >> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi > >> >> > else_clause = gimple_assign_rhs3 (stmt); > >> >> > > >> >> > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, > >> >> > - &comp_vectype, &dts[0]) > >> >> > + &comp_vectype, &dts[0], vectype) > >> >> > || !comp_vectype) > >> >> > return false; > >> >> > > >> >> > Index: gcc/tree-vect-patterns.c > >> >> > =================================================================== > >> >> > --- gcc/tree-vect-patterns.c (revision 255438) > >> >> > +++ gcc/tree-vect-patterns.c (working copy) > >> >> > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< > >> >> > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) > >> >> > return NULL; > >> >> > > >> >> > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR > >> >> > + in place, we can handle it in vectorizable_condition. This avoids > >> >> > + unnecessary promotion stmts and increased vectorization factor. */ > >> >> > + if (COMPARISON_CLASS_P (rhs1) > >> >> > + && INTEGRAL_TYPE_P (rhs1_type) > >> >> > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) > >> >> > + { > >> >> > + gimple *dummy; > >> >> > + enum vect_def_type dt; > >> >> > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, > >> >> > + &dummy, &dt) > >> >> > + && dt == vect_external_def > >> >> > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, > >> >> > + &dummy, &dt) > >> >> > + && (dt == vect_external_def > >> >> > + || dt == vect_constant_def)) > >> >> > + { > >> >> > + tree wide_scalar_type = build_nonstandard_integer_type > >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), > >> >> > + TYPE_UNSIGNED (rhs1_type)); > >> >> > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); > >> >> > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) > >> >> > + return NULL; > >> >> > + } > >> >> > + } > >> >> > + > >> >> > /* If rhs1 is a comparison we need to move it into a > >> >> > separate statement. */ > >> >> > if (TREE_CODE (rhs1) != SSA_NAME) > >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize9.c > >> >> > =================================================================== > >> >> > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) > >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) > >> >> > @@ -0,0 +1,16 @@ > >> >> > +/* { dg-do compile } */ > >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ > >> >> > + > >> >> > +double x[1024][1024], red[1024]; > >> >> > +void foo (void) > >> >> > +{ > >> >> > + for (int i = 0; i < 1024; ++i) > >> >> > + for (int j = 0; j < 1024; ++j) > >> >> > + { > >> >> > + double v = i == 0 ? 0.0 : red[j]; > >> >> > + v = v + x[i][j]; > >> >> > + red[j] = v; > >> >> > + } > >> >> > +} > >> >> > + > >> >> > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ > >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize10.c > >> >> > =================================================================== > >> >> > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) > >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) > >> >> > @@ -0,0 +1,16 @@ > >> >> > +/* { dg-do compile } */ > >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ > >> >> > + > >> >> > +double x[1024][1024], red[1024]; > >> >> > +void foo (void) > >> >> > +{ > >> >> > + for (int i = 0; i < 1024; ++i) > >> >> > + for (int j = 0; j < 1024; ++j) > >> >> > + { > >> >> > + double v = i == 0 ? 0.0 : red[j]; > >> >> > + v = v + x[i][j]; > >> >> > + red[j] = v; > >> >> > + } > >> >> > +} > >> >> > + > >> >> > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ > >> >> > >> >> > >> > > >> > -- > >> > Richard Biener <rguenther@suse.de> > >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) > >> > >> > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) > >
On Fri, Dec 8, 2017 at 9:54 AM, Richard Biener <rguenther@suse.de> wrote: > On Fri, 8 Dec 2017, Bin.Cheng wrote: > >> On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener <rguenther@suse.de> wrote: >> > On Fri, 8 Dec 2017, Christophe Lyon wrote: >> > >> >> On 8 December 2017 at 09:07, Richard Biener <rguenther@suse.de> wrote: >> >> > On Thu, 7 Dec 2017, Bin.Cheng wrote: >> >> > >> >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: >> >> >> > >> >> >> > The following fixes a vectorization issue that appears when trying >> >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was >> >> >> > performed by the interchange pass. That interchange inserts the >> >> >> > following code for the former reduction created by LIM store-motion: >> >> >> I do observe more cases are vectorized by this patch on AArch64. >> >> >> Still want to find a way not generating the cond_expr, but for the moment >> >> >> I will have another patch make interchange even more conservative for >> >> >> small cases. In which the new cmp/select instructions do cost a lot against >> >> >> the small loop body. >> >> > >> >> > Yeah. I thought about what it takes to avoid the conditional - basically >> >> > we'd need to turn the init value to a (non-nested) loop that we'd need >> >> > to insert on the preheader of the outer loop. >> >> > >> >> >> >> Hi, >> >> >> >> I noticed a regression on aarch64 after Bin's commit r255472: >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+, >> >> \\[x[0-9]+\\], [0-9]+ >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+, >> >> \\[x[0-9]+, [0-9]+\\]! >> >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s, >> >> v[0-9]+.4s, v[0-9]+.s\\[0\\] >> >> >> >> Is this patch supposed to fix it? >> > >> > No, from what I can see the patch shouldn't affect it. But it's not >> > clear what the testcase tests for - it just scans assembler. >> > Clearly we want to interchange the loop here so the scan assembler >> I am not very sure. Though interchanging gives better cache behavior, >> but the loop is relatively small here, the introduced cond_expr >> results in two more instructions, as well as one additional memory >> access from undoing reduction. Together with addressing mode chosen >> in ivopts, it leads to obvious regression. >> Ah, another issue is the cond_expr blocks vectorization without your >> patch here. This case is what I meant small loops in which more >> conservative interchange may be wanted. > > The loop has int data and int IVs so my patch shouldn't be necessary > to vectorize the loop. I haven't got time look into vectorizer part yet, but there is below in dump file: pr62178.c:12:7: note: vect_is_simple_use: operand k_9 pr62178.c:12:7: note: def_stmt: k_9 = PHI <1(7), k_38(10)> pr62178.c:12:7: note: type of def: external pr62178.c:12:7: note: not vectorized: relevant stmt not supported: r_I_I_lsm.0_19 = k_9 != 1 ? r_I_I_lsm.0_14 : 0; pr62178.c:12:7: note: bad operation or unsupported loop bound. pr62178.c:12:7: note: ***** Re-trying analysis with vector size 8 Thanks, bin > > Richard. > >> Thanks, >> bin >> >> > needs to be adjusted and one has to revisit PR62178 to check whether >> > the result is still ok (or simply add -fno-loop-interchange to it). >> > >> > Richard. >> > >> >> Thanks, >> >> >> >> Christophe >> >> >> >> > Richard. >> >> > >> >> >> Thanks, >> >> >> bin >> >> >> > >> >> >> > <bb 13> [local count: 161061274]: >> >> >> > # m_58 = PHI <1(10), m_84(20)> >> >> >> > ... >> >> >> > <bb 14> [local count: 912680551]: >> >> >> > # l_35 = PHI <1(13), l_57(21)> >> >> >> > ... >> >> >> > y__I_lsm.113_140 = *y_139(D)[_31]; >> >> >> > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; >> >> >> > ... >> >> >> > *y_139(D)[_31] = _101; >> >> >> > >> >> >> > >> >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with >> >> >> > double values. Note that the m_58 != 1 condition is invariant >> >> >> > in the l loop. >> >> >> > >> >> >> > Currently we vectorize this condition using V8SImode vectors >> >> >> > causing a vectorization factor of 8 and thus forcing the scalar >> >> >> > path for the bwaves case (the loops have an iteration count of 5). >> >> >> > >> >> >> > The following patch makes the vectorizer handle invariant conditions >> >> >> > in the first place and second handle widening of operands of invariant >> >> >> > conditions transparently (the promotion will happen on the invariant >> >> >> > scalars). This makes it possible to use a vectorization factor of 4, >> >> >> > reducing the bwaves runtime from 208s before interchange >> >> >> > (via 190s after interchange) to 172s after interchange and vectorization >> >> >> > with AVX256 (on a Haswell machine). >> >> >> > >> >> >> > For the vectorizable_condition part to work I need to avoid >> >> >> > pulling apart the condition from the COND_EXPR during pattern >> >> >> > detection. >> >> >> > >> >> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. >> >> >> > >> >> >> > Richard. >> >> >> > >> >> >> > 2017-12-06 Richard Biener <rguenther@suse.de> >> >> >> > >> >> >> > PR tree-optimization/81303 >> >> >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant >> >> >> > conditions try to create a comparison vector type matching >> >> >> > the data vector type. >> >> >> > (vectorizable_condition): Adjust. >> >> >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): >> >> >> > Leave invariant conditions alone in case we can vectorize those. >> >> >> > >> >> >> > * gcc.target/i386/vectorize9.c: New testcase. >> >> >> > * gcc.target/i386/vectorize10.c: New testcase. >> >> >> > >> >> >> > Index: gcc/tree-vect-stmts.c >> >> >> > =================================================================== >> >> >> > --- gcc/tree-vect-stmts.c (revision 255438) >> >> >> > +++ gcc/tree-vect-stmts.c (working copy) >> >> >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ >> >> >> > >> >> >> > static bool >> >> >> > vect_is_simple_cond (tree cond, vec_info *vinfo, >> >> >> > - tree *comp_vectype, enum vect_def_type *dts) >> >> >> > + tree *comp_vectype, enum vect_def_type *dts, >> >> >> > + tree vectype) >> >> >> > { >> >> >> > tree lhs, rhs; >> >> >> > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; >> >> >> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info >> >> >> > return false; >> >> >> > >> >> >> > *comp_vectype = vectype1 ? vectype1 : vectype2; >> >> >> > + /* Invariant comparison. */ >> >> >> > + if (! *comp_vectype) >> >> >> > + { >> >> >> > + tree scalar_type = TREE_TYPE (lhs); >> >> >> > + /* If we can widen the comparison to match vectype do so. */ >> >> >> > + if (INTEGRAL_TYPE_P (scalar_type) >> >> >> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), >> >> >> > + TYPE_SIZE (TREE_TYPE (vectype)))) >> >> >> > + scalar_type = build_nonstandard_integer_type >> >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), >> >> >> > + TYPE_UNSIGNED (scalar_type)); >> >> >> > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); >> >> >> > + } >> >> >> > + >> >> >> > return true; >> >> >> > } >> >> >> > >> >> >> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi >> >> >> > else_clause = gimple_assign_rhs3 (stmt); >> >> >> > >> >> >> > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, >> >> >> > - &comp_vectype, &dts[0]) >> >> >> > + &comp_vectype, &dts[0], vectype) >> >> >> > || !comp_vectype) >> >> >> > return false; >> >> >> > >> >> >> > Index: gcc/tree-vect-patterns.c >> >> >> > =================================================================== >> >> >> > --- gcc/tree-vect-patterns.c (revision 255438) >> >> >> > +++ gcc/tree-vect-patterns.c (working copy) >> >> >> > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< >> >> >> > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) >> >> >> > return NULL; >> >> >> > >> >> >> > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR >> >> >> > + in place, we can handle it in vectorizable_condition. This avoids >> >> >> > + unnecessary promotion stmts and increased vectorization factor. */ >> >> >> > + if (COMPARISON_CLASS_P (rhs1) >> >> >> > + && INTEGRAL_TYPE_P (rhs1_type) >> >> >> > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) >> >> >> > + { >> >> >> > + gimple *dummy; >> >> >> > + enum vect_def_type dt; >> >> >> > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, >> >> >> > + &dummy, &dt) >> >> >> > + && dt == vect_external_def >> >> >> > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, >> >> >> > + &dummy, &dt) >> >> >> > + && (dt == vect_external_def >> >> >> > + || dt == vect_constant_def)) >> >> >> > + { >> >> >> > + tree wide_scalar_type = build_nonstandard_integer_type >> >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), >> >> >> > + TYPE_UNSIGNED (rhs1_type)); >> >> >> > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); >> >> >> > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) >> >> >> > + return NULL; >> >> >> > + } >> >> >> > + } >> >> >> > + >> >> >> > /* If rhs1 is a comparison we need to move it into a >> >> >> > separate statement. */ >> >> >> > if (TREE_CODE (rhs1) != SSA_NAME) >> >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize9.c >> >> >> > =================================================================== >> >> >> > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) >> >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) >> >> >> > @@ -0,0 +1,16 @@ >> >> >> > +/* { dg-do compile } */ >> >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ >> >> >> > + >> >> >> > +double x[1024][1024], red[1024]; >> >> >> > +void foo (void) >> >> >> > +{ >> >> >> > + for (int i = 0; i < 1024; ++i) >> >> >> > + for (int j = 0; j < 1024; ++j) >> >> >> > + { >> >> >> > + double v = i == 0 ? 0.0 : red[j]; >> >> >> > + v = v + x[i][j]; >> >> >> > + red[j] = v; >> >> >> > + } >> >> >> > +} >> >> >> > + >> >> >> > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ >> >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize10.c >> >> >> > =================================================================== >> >> >> > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) >> >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) >> >> >> > @@ -0,0 +1,16 @@ >> >> >> > +/* { dg-do compile } */ >> >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ >> >> >> > + >> >> >> > +double x[1024][1024], red[1024]; >> >> >> > +void foo (void) >> >> >> > +{ >> >> >> > + for (int i = 0; i < 1024; ++i) >> >> >> > + for (int j = 0; j < 1024; ++j) >> >> >> > + { >> >> >> > + double v = i == 0 ? 0.0 : red[j]; >> >> >> > + v = v + x[i][j]; >> >> >> > + red[j] = v; >> >> >> > + } >> >> >> > +} >> >> >> > + >> >> >> > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ >> >> >> >> >> >> >> >> > >> >> > -- >> >> > Richard Biener <rguenther@suse.de> >> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) >> >> >> >> >> > >> > -- >> > Richard Biener <rguenther@suse.de> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) >> >> > > -- > Richard Biener <rguenther@suse.de> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
On Fri, 8 Dec 2017, Bin.Cheng wrote: > On Fri, Dec 8, 2017 at 9:54 AM, Richard Biener <rguenther@suse.de> wrote: > > On Fri, 8 Dec 2017, Bin.Cheng wrote: > > > >> On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener <rguenther@suse.de> wrote: > >> > On Fri, 8 Dec 2017, Christophe Lyon wrote: > >> > > >> >> On 8 December 2017 at 09:07, Richard Biener <rguenther@suse.de> wrote: > >> >> > On Thu, 7 Dec 2017, Bin.Cheng wrote: > >> >> > > >> >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: > >> >> >> > > >> >> >> > The following fixes a vectorization issue that appears when trying > >> >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was > >> >> >> > performed by the interchange pass. That interchange inserts the > >> >> >> > following code for the former reduction created by LIM store-motion: > >> >> >> I do observe more cases are vectorized by this patch on AArch64. > >> >> >> Still want to find a way not generating the cond_expr, but for the moment > >> >> >> I will have another patch make interchange even more conservative for > >> >> >> small cases. In which the new cmp/select instructions do cost a lot against > >> >> >> the small loop body. > >> >> > > >> >> > Yeah. I thought about what it takes to avoid the conditional - basically > >> >> > we'd need to turn the init value to a (non-nested) loop that we'd need > >> >> > to insert on the preheader of the outer loop. > >> >> > > >> >> > >> >> Hi, > >> >> > >> >> I noticed a regression on aarch64 after Bin's commit r255472: > >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+, > >> >> \\[x[0-9]+\\], [0-9]+ > >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+, > >> >> \\[x[0-9]+, [0-9]+\\]! > >> >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s, > >> >> v[0-9]+.4s, v[0-9]+.s\\[0\\] > >> >> > >> >> Is this patch supposed to fix it? > >> > > >> > No, from what I can see the patch shouldn't affect it. But it's not > >> > clear what the testcase tests for - it just scans assembler. > >> > Clearly we want to interchange the loop here so the scan assembler > >> I am not very sure. Though interchanging gives better cache behavior, > >> but the loop is relatively small here, the introduced cond_expr > >> results in two more instructions, as well as one additional memory > >> access from undoing reduction. Together with addressing mode chosen > >> in ivopts, it leads to obvious regression. > >> Ah, another issue is the cond_expr blocks vectorization without your > >> patch here. This case is what I meant small loops in which more > >> conservative interchange may be wanted. > > > > The loop has int data and int IVs so my patch shouldn't be necessary > > to vectorize the loop. > I haven't got time look into vectorizer part yet, but there is below > in dump file: > > pr62178.c:12:7: note: vect_is_simple_use: operand k_9 > pr62178.c:12:7: note: def_stmt: k_9 = PHI <1(7), k_38(10)> > pr62178.c:12:7: note: type of def: external > pr62178.c:12:7: note: not vectorized: relevant stmt not supported: > r_I_I_lsm.0_19 = k_9 != 1 ? r_I_I_lsm.0_14 : 0; > pr62178.c:12:7: note: bad operation or unsupported loop bound. > pr62178.c:12:7: note: ***** Re-trying analysis with vector size 8 Yes, so neon doesn't have a way to vectorize such conditionals? It does set vect_condition and even vect_cond_mixed though. You'd have to dive into why it thinks it cannot vectorize it. I suggest opening a bug if my patch didn't help. Richard. > Thanks, > bin > > > > Richard. > > > >> Thanks, > >> bin > >> > >> > needs to be adjusted and one has to revisit PR62178 to check whether > >> > the result is still ok (or simply add -fno-loop-interchange to it). > >> > > >> > Richard. > >> > > >> >> Thanks, > >> >> > >> >> Christophe > >> >> > >> >> > Richard. > >> >> > > >> >> >> Thanks, > >> >> >> bin > >> >> >> > > >> >> >> > <bb 13> [local count: 161061274]: > >> >> >> > # m_58 = PHI <1(10), m_84(20)> > >> >> >> > ... > >> >> >> > <bb 14> [local count: 912680551]: > >> >> >> > # l_35 = PHI <1(13), l_57(21)> > >> >> >> > ... > >> >> >> > y__I_lsm.113_140 = *y_139(D)[_31]; > >> >> >> > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; > >> >> >> > ... > >> >> >> > *y_139(D)[_31] = _101; > >> >> >> > > >> >> >> > > >> >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with > >> >> >> > double values. Note that the m_58 != 1 condition is invariant > >> >> >> > in the l loop. > >> >> >> > > >> >> >> > Currently we vectorize this condition using V8SImode vectors > >> >> >> > causing a vectorization factor of 8 and thus forcing the scalar > >> >> >> > path for the bwaves case (the loops have an iteration count of 5). > >> >> >> > > >> >> >> > The following patch makes the vectorizer handle invariant conditions > >> >> >> > in the first place and second handle widening of operands of invariant > >> >> >> > conditions transparently (the promotion will happen on the invariant > >> >> >> > scalars). This makes it possible to use a vectorization factor of 4, > >> >> >> > reducing the bwaves runtime from 208s before interchange > >> >> >> > (via 190s after interchange) to 172s after interchange and vectorization > >> >> >> > with AVX256 (on a Haswell machine). > >> >> >> > > >> >> >> > For the vectorizable_condition part to work I need to avoid > >> >> >> > pulling apart the condition from the COND_EXPR during pattern > >> >> >> > detection. > >> >> >> > > >> >> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. > >> >> >> > > >> >> >> > Richard. > >> >> >> > > >> >> >> > 2017-12-06 Richard Biener <rguenther@suse.de> > >> >> >> > > >> >> >> > PR tree-optimization/81303 > >> >> >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant > >> >> >> > conditions try to create a comparison vector type matching > >> >> >> > the data vector type. > >> >> >> > (vectorizable_condition): Adjust. > >> >> >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): > >> >> >> > Leave invariant conditions alone in case we can vectorize those. > >> >> >> > > >> >> >> > * gcc.target/i386/vectorize9.c: New testcase. > >> >> >> > * gcc.target/i386/vectorize10.c: New testcase. > >> >> >> > > >> >> >> > Index: gcc/tree-vect-stmts.c > >> >> >> > =================================================================== > >> >> >> > --- gcc/tree-vect-stmts.c (revision 255438) > >> >> >> > +++ gcc/tree-vect-stmts.c (working copy) > >> >> >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ > >> >> >> > > >> >> >> > static bool > >> >> >> > vect_is_simple_cond (tree cond, vec_info *vinfo, > >> >> >> > - tree *comp_vectype, enum vect_def_type *dts) > >> >> >> > + tree *comp_vectype, enum vect_def_type *dts, > >> >> >> > + tree vectype) > >> >> >> > { > >> >> >> > tree lhs, rhs; > >> >> >> > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; > >> >> >> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info > >> >> >> > return false; > >> >> >> > > >> >> >> > *comp_vectype = vectype1 ? vectype1 : vectype2; > >> >> >> > + /* Invariant comparison. */ > >> >> >> > + if (! *comp_vectype) > >> >> >> > + { > >> >> >> > + tree scalar_type = TREE_TYPE (lhs); > >> >> >> > + /* If we can widen the comparison to match vectype do so. */ > >> >> >> > + if (INTEGRAL_TYPE_P (scalar_type) > >> >> >> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), > >> >> >> > + TYPE_SIZE (TREE_TYPE (vectype)))) > >> >> >> > + scalar_type = build_nonstandard_integer_type > >> >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), > >> >> >> > + TYPE_UNSIGNED (scalar_type)); > >> >> >> > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); > >> >> >> > + } > >> >> >> > + > >> >> >> > return true; > >> >> >> > } > >> >> >> > > >> >> >> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi > >> >> >> > else_clause = gimple_assign_rhs3 (stmt); > >> >> >> > > >> >> >> > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, > >> >> >> > - &comp_vectype, &dts[0]) > >> >> >> > + &comp_vectype, &dts[0], vectype) > >> >> >> > || !comp_vectype) > >> >> >> > return false; > >> >> >> > > >> >> >> > Index: gcc/tree-vect-patterns.c > >> >> >> > =================================================================== > >> >> >> > --- gcc/tree-vect-patterns.c (revision 255438) > >> >> >> > +++ gcc/tree-vect-patterns.c (working copy) > >> >> >> > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< > >> >> >> > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) > >> >> >> > return NULL; > >> >> >> > > >> >> >> > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR > >> >> >> > + in place, we can handle it in vectorizable_condition. This avoids > >> >> >> > + unnecessary promotion stmts and increased vectorization factor. */ > >> >> >> > + if (COMPARISON_CLASS_P (rhs1) > >> >> >> > + && INTEGRAL_TYPE_P (rhs1_type) > >> >> >> > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) > >> >> >> > + { > >> >> >> > + gimple *dummy; > >> >> >> > + enum vect_def_type dt; > >> >> >> > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, > >> >> >> > + &dummy, &dt) > >> >> >> > + && dt == vect_external_def > >> >> >> > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, > >> >> >> > + &dummy, &dt) > >> >> >> > + && (dt == vect_external_def > >> >> >> > + || dt == vect_constant_def)) > >> >> >> > + { > >> >> >> > + tree wide_scalar_type = build_nonstandard_integer_type > >> >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), > >> >> >> > + TYPE_UNSIGNED (rhs1_type)); > >> >> >> > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); > >> >> >> > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) > >> >> >> > + return NULL; > >> >> >> > + } > >> >> >> > + } > >> >> >> > + > >> >> >> > /* If rhs1 is a comparison we need to move it into a > >> >> >> > separate statement. */ > >> >> >> > if (TREE_CODE (rhs1) != SSA_NAME) > >> >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize9.c > >> >> >> > =================================================================== > >> >> >> > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) > >> >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) > >> >> >> > @@ -0,0 +1,16 @@ > >> >> >> > +/* { dg-do compile } */ > >> >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ > >> >> >> > + > >> >> >> > +double x[1024][1024], red[1024]; > >> >> >> > +void foo (void) > >> >> >> > +{ > >> >> >> > + for (int i = 0; i < 1024; ++i) > >> >> >> > + for (int j = 0; j < 1024; ++j) > >> >> >> > + { > >> >> >> > + double v = i == 0 ? 0.0 : red[j]; > >> >> >> > + v = v + x[i][j]; > >> >> >> > + red[j] = v; > >> >> >> > + } > >> >> >> > +} > >> >> >> > + > >> >> >> > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ > >> >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize10.c > >> >> >> > =================================================================== > >> >> >> > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) > >> >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) > >> >> >> > @@ -0,0 +1,16 @@ > >> >> >> > +/* { dg-do compile } */ > >> >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ > >> >> >> > + > >> >> >> > +double x[1024][1024], red[1024]; > >> >> >> > +void foo (void) > >> >> >> > +{ > >> >> >> > + for (int i = 0; i < 1024; ++i) > >> >> >> > + for (int j = 0; j < 1024; ++j) > >> >> >> > + { > >> >> >> > + double v = i == 0 ? 0.0 : red[j]; > >> >> >> > + v = v + x[i][j]; > >> >> >> > + red[j] = v; > >> >> >> > + } > >> >> >> > +} > >> >> >> > + > >> >> >> > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ > >> >> >> > >> >> >> > >> >> > > >> >> > -- > >> >> > Richard Biener <rguenther@suse.de> > >> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) > >> >> > >> >> > >> > > >> > -- > >> > Richard Biener <rguenther@suse.de> > >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) > >> > >> > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) > >
On Fri, Dec 8, 2017 at 10:39 AM, Richard Biener <rguenther@suse.de> wrote: > On Fri, 8 Dec 2017, Bin.Cheng wrote: > >> On Fri, Dec 8, 2017 at 9:54 AM, Richard Biener <rguenther@suse.de> wrote: >> > On Fri, 8 Dec 2017, Bin.Cheng wrote: >> > >> >> On Fri, Dec 8, 2017 at 8:29 AM, Richard Biener <rguenther@suse.de> wrote: >> >> > On Fri, 8 Dec 2017, Christophe Lyon wrote: >> >> > >> >> >> On 8 December 2017 at 09:07, Richard Biener <rguenther@suse.de> wrote: >> >> >> > On Thu, 7 Dec 2017, Bin.Cheng wrote: >> >> >> > >> >> >> >> On Wed, Dec 6, 2017 at 1:29 PM, Richard Biener <rguenther@suse.de> wrote: >> >> >> >> > >> >> >> >> > The following fixes a vectorization issue that appears when trying >> >> >> >> > to vectorize the bwaves mat_times_vec kernel after interchange was >> >> >> >> > performed by the interchange pass. That interchange inserts the >> >> >> >> > following code for the former reduction created by LIM store-motion: >> >> >> >> I do observe more cases are vectorized by this patch on AArch64. >> >> >> >> Still want to find a way not generating the cond_expr, but for the moment >> >> >> >> I will have another patch make interchange even more conservative for >> >> >> >> small cases. In which the new cmp/select instructions do cost a lot against >> >> >> >> the small loop body. >> >> >> > >> >> >> > Yeah. I thought about what it takes to avoid the conditional - basically >> >> >> > we'd need to turn the init value to a (non-nested) loop that we'd need >> >> >> > to insert on the preheader of the outer loop. >> >> >> > >> >> >> >> >> >> Hi, >> >> >> >> >> >> I noticed a regression on aarch64 after Bin's commit r255472: >> >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\tq[0-9]+, >> >> >> \\[x[0-9]+\\], [0-9]+ >> >> >> gcc.target/aarch64/pr62178.c scan-assembler ldr\\ts[0-9]+, >> >> >> \\[x[0-9]+, [0-9]+\\]! >> >> >> gcc.target/aarch64/pr62178.c scan-assembler mla\\tv[0-9]+.4s, >> >> >> v[0-9]+.4s, v[0-9]+.s\\[0\\] >> >> >> >> >> >> Is this patch supposed to fix it? >> >> > >> >> > No, from what I can see the patch shouldn't affect it. But it's not >> >> > clear what the testcase tests for - it just scans assembler. >> >> > Clearly we want to interchange the loop here so the scan assembler >> >> I am not very sure. Though interchanging gives better cache behavior, >> >> but the loop is relatively small here, the introduced cond_expr >> >> results in two more instructions, as well as one additional memory >> >> access from undoing reduction. Together with addressing mode chosen >> >> in ivopts, it leads to obvious regression. >> >> Ah, another issue is the cond_expr blocks vectorization without your >> >> patch here. This case is what I meant small loops in which more >> >> conservative interchange may be wanted. >> > >> > The loop has int data and int IVs so my patch shouldn't be necessary >> > to vectorize the loop. >> I haven't got time look into vectorizer part yet, but there is below >> in dump file: >> >> pr62178.c:12:7: note: vect_is_simple_use: operand k_9 >> pr62178.c:12:7: note: def_stmt: k_9 = PHI <1(7), k_38(10)> >> pr62178.c:12:7: note: type of def: external >> pr62178.c:12:7: note: not vectorized: relevant stmt not supported: >> r_I_I_lsm.0_19 = k_9 != 1 ? r_I_I_lsm.0_14 : 0; >> pr62178.c:12:7: note: bad operation or unsupported loop bound. >> pr62178.c:12:7: note: ***** Re-trying analysis with vector size 8 > > Yes, so neon doesn't have a way to vectorize such conditionals? > It does set vect_condition and even vect_cond_mixed though. It should. IIRC, it was me enables vect_cond* stuff on AArch64? Will look into it after fixing more urgent bugs. > You'd have to dive into why it thinks it cannot vectorize it. > > I suggest opening a bug if my patch didn't help. You patch does help vectorizing the loop, but not sure if it's in a perfect way. I guess vect_factor is a problem here. Thanks, bin > > Richard. > >> Thanks, >> bin >> > >> > Richard. >> > >> >> Thanks, >> >> bin >> >> >> >> > needs to be adjusted and one has to revisit PR62178 to check whether >> >> > the result is still ok (or simply add -fno-loop-interchange to it). >> >> > >> >> > Richard. >> >> > >> >> >> Thanks, >> >> >> >> >> >> Christophe >> >> >> >> >> >> > Richard. >> >> >> > >> >> >> >> Thanks, >> >> >> >> bin >> >> >> >> > >> >> >> >> > <bb 13> [local count: 161061274]: >> >> >> >> > # m_58 = PHI <1(10), m_84(20)> >> >> >> >> > ... >> >> >> >> > <bb 14> [local count: 912680551]: >> >> >> >> > # l_35 = PHI <1(13), l_57(21)> >> >> >> >> > ... >> >> >> >> > y__I_lsm.113_140 = *y_139(D)[_31]; >> >> >> >> > y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0; >> >> >> >> > ... >> >> >> >> > *y_139(D)[_31] = _101; >> >> >> >> > >> >> >> >> > >> >> >> >> > so we have a COND_EXPR with a test on an integer IV m_58 with >> >> >> >> > double values. Note that the m_58 != 1 condition is invariant >> >> >> >> > in the l loop. >> >> >> >> > >> >> >> >> > Currently we vectorize this condition using V8SImode vectors >> >> >> >> > causing a vectorization factor of 8 and thus forcing the scalar >> >> >> >> > path for the bwaves case (the loops have an iteration count of 5). >> >> >> >> > >> >> >> >> > The following patch makes the vectorizer handle invariant conditions >> >> >> >> > in the first place and second handle widening of operands of invariant >> >> >> >> > conditions transparently (the promotion will happen on the invariant >> >> >> >> > scalars). This makes it possible to use a vectorization factor of 4, >> >> >> >> > reducing the bwaves runtime from 208s before interchange >> >> >> >> > (via 190s after interchange) to 172s after interchange and vectorization >> >> >> >> > with AVX256 (on a Haswell machine). >> >> >> >> > >> >> >> >> > For the vectorizable_condition part to work I need to avoid >> >> >> >> > pulling apart the condition from the COND_EXPR during pattern >> >> >> >> > detection. >> >> >> >> > >> >> >> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. >> >> >> >> > >> >> >> >> > Richard. >> >> >> >> > >> >> >> >> > 2017-12-06 Richard Biener <rguenther@suse.de> >> >> >> >> > >> >> >> >> > PR tree-optimization/81303 >> >> >> >> > * tree-vect-stmts.c (vect_is_simple_cond): For invariant >> >> >> >> > conditions try to create a comparison vector type matching >> >> >> >> > the data vector type. >> >> >> >> > (vectorizable_condition): Adjust. >> >> >> >> > * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): >> >> >> >> > Leave invariant conditions alone in case we can vectorize those. >> >> >> >> > >> >> >> >> > * gcc.target/i386/vectorize9.c: New testcase. >> >> >> >> > * gcc.target/i386/vectorize10.c: New testcase. >> >> >> >> > >> >> >> >> > Index: gcc/tree-vect-stmts.c >> >> >> >> > =================================================================== >> >> >> >> > --- gcc/tree-vect-stmts.c (revision 255438) >> >> >> >> > +++ gcc/tree-vect-stmts.c (working copy) >> >> >> >> > @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ >> >> >> >> > >> >> >> >> > static bool >> >> >> >> > vect_is_simple_cond (tree cond, vec_info *vinfo, >> >> >> >> > - tree *comp_vectype, enum vect_def_type *dts) >> >> >> >> > + tree *comp_vectype, enum vect_def_type *dts, >> >> >> >> > + tree vectype) >> >> >> >> > { >> >> >> >> > tree lhs, rhs; >> >> >> >> > tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; >> >> >> >> > @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info >> >> >> >> > return false; >> >> >> >> > >> >> >> >> > *comp_vectype = vectype1 ? vectype1 : vectype2; >> >> >> >> > + /* Invariant comparison. */ >> >> >> >> > + if (! *comp_vectype) >> >> >> >> > + { >> >> >> >> > + tree scalar_type = TREE_TYPE (lhs); >> >> >> >> > + /* If we can widen the comparison to match vectype do so. */ >> >> >> >> > + if (INTEGRAL_TYPE_P (scalar_type) >> >> >> >> > + && tree_int_cst_lt (TYPE_SIZE (scalar_type), >> >> >> >> > + TYPE_SIZE (TREE_TYPE (vectype)))) >> >> >> >> > + scalar_type = build_nonstandard_integer_type >> >> >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), >> >> >> >> > + TYPE_UNSIGNED (scalar_type)); >> >> >> >> > + *comp_vectype = get_vectype_for_scalar_type (scalar_type); >> >> >> >> > + } >> >> >> >> > + >> >> >> >> > return true; >> >> >> >> > } >> >> >> >> > >> >> >> >> > @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi >> >> >> >> > else_clause = gimple_assign_rhs3 (stmt); >> >> >> >> > >> >> >> >> > if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, >> >> >> >> > - &comp_vectype, &dts[0]) >> >> >> >> > + &comp_vectype, &dts[0], vectype) >> >> >> >> > || !comp_vectype) >> >> >> >> > return false; >> >> >> >> > >> >> >> >> > Index: gcc/tree-vect-patterns.c >> >> >> >> > =================================================================== >> >> >> >> > --- gcc/tree-vect-patterns.c (revision 255438) >> >> >> >> > +++ gcc/tree-vect-patterns.c (working copy) >> >> >> >> > @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< >> >> >> >> > || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) >> >> >> >> > return NULL; >> >> >> >> > >> >> >> >> > + /* If rhs1 is invariant and we can promote it leave the COND_EXPR >> >> >> >> > + in place, we can handle it in vectorizable_condition. This avoids >> >> >> >> > + unnecessary promotion stmts and increased vectorization factor. */ >> >> >> >> > + if (COMPARISON_CLASS_P (rhs1) >> >> >> >> > + && INTEGRAL_TYPE_P (rhs1_type) >> >> >> >> > + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) >> >> >> >> > + { >> >> >> >> > + gimple *dummy; >> >> >> >> > + enum vect_def_type dt; >> >> >> >> > + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, >> >> >> >> > + &dummy, &dt) >> >> >> >> > + && dt == vect_external_def >> >> >> >> > + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, >> >> >> >> > + &dummy, &dt) >> >> >> >> > + && (dt == vect_external_def >> >> >> >> > + || dt == vect_constant_def)) >> >> >> >> > + { >> >> >> >> > + tree wide_scalar_type = build_nonstandard_integer_type >> >> >> >> > + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), >> >> >> >> > + TYPE_UNSIGNED (rhs1_type)); >> >> >> >> > + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); >> >> >> >> > + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) >> >> >> >> > + return NULL; >> >> >> >> > + } >> >> >> >> > + } >> >> >> >> > + >> >> >> >> > /* If rhs1 is a comparison we need to move it into a >> >> >> >> > separate statement. */ >> >> >> >> > if (TREE_CODE (rhs1) != SSA_NAME) >> >> >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize9.c >> >> >> >> > =================================================================== >> >> >> >> > --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) >> >> >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) >> >> >> >> > @@ -0,0 +1,16 @@ >> >> >> >> > +/* { dg-do compile } */ >> >> >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ >> >> >> >> > + >> >> >> >> > +double x[1024][1024], red[1024]; >> >> >> >> > +void foo (void) >> >> >> >> > +{ >> >> >> >> > + for (int i = 0; i < 1024; ++i) >> >> >> >> > + for (int j = 0; j < 1024; ++j) >> >> >> >> > + { >> >> >> >> > + double v = i == 0 ? 0.0 : red[j]; >> >> >> >> > + v = v + x[i][j]; >> >> >> >> > + red[j] = v; >> >> >> >> > + } >> >> >> >> > +} >> >> >> >> > + >> >> >> >> > +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ >> >> >> >> > Index: gcc/testsuite/gcc.target/i386/vectorize10.c >> >> >> >> > =================================================================== >> >> >> >> > --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) >> >> >> >> > +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) >> >> >> >> > @@ -0,0 +1,16 @@ >> >> >> >> > +/* { dg-do compile } */ >> >> >> >> > +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ >> >> >> >> > + >> >> >> >> > +double x[1024][1024], red[1024]; >> >> >> >> > +void foo (void) >> >> >> >> > +{ >> >> >> >> > + for (int i = 0; i < 1024; ++i) >> >> >> >> > + for (int j = 0; j < 1024; ++j) >> >> >> >> > + { >> >> >> >> > + double v = i == 0 ? 0.0 : red[j]; >> >> >> >> > + v = v + x[i][j]; >> >> >> >> > + red[j] = v; >> >> >> >> > + } >> >> >> >> > +} >> >> >> >> > + >> >> >> >> > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ >> >> >> >> >> >> >> >> >> >> >> > >> >> >> > -- >> >> >> > Richard Biener <rguenther@suse.de> >> >> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) >> >> >> >> >> >> >> >> > >> >> > -- >> >> > Richard Biener <rguenther@suse.de> >> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) >> >> >> >> >> > >> > -- >> > Richard Biener <rguenther@suse.de> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg) >> >> > > -- > Richard Biener <rguenther@suse.de> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 255438) +++ gcc/tree-vect-stmts.c (working copy) @@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_ static bool vect_is_simple_cond (tree cond, vec_info *vinfo, - tree *comp_vectype, enum vect_def_type *dts) + tree *comp_vectype, enum vect_def_type *dts, + tree vectype) { tree lhs, rhs; tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; @@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info return false; *comp_vectype = vectype1 ? vectype1 : vectype2; + /* Invariant comparison. */ + if (! *comp_vectype) + { + tree scalar_type = TREE_TYPE (lhs); + /* If we can widen the comparison to match vectype do so. */ + if (INTEGRAL_TYPE_P (scalar_type) + && tree_int_cst_lt (TYPE_SIZE (scalar_type), + TYPE_SIZE (TREE_TYPE (vectype)))) + scalar_type = build_nonstandard_integer_type + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), + TYPE_UNSIGNED (scalar_type)); + *comp_vectype = get_vectype_for_scalar_type (scalar_type); + } + return true; } @@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi else_clause = gimple_assign_rhs3 (stmt); if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, - &comp_vectype, &dts[0]) + &comp_vectype, &dts[0], vectype) || !comp_vectype) return false; Index: gcc/tree-vect-patterns.c =================================================================== --- gcc/tree-vect-patterns.c (revision 255438) +++ gcc/tree-vect-patterns.c (working copy) @@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec< || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2)) return NULL; + /* If rhs1 is invariant and we can promote it leave the COND_EXPR + in place, we can handle it in vectorizable_condition. This avoids + unnecessary promotion stmts and increased vectorization factor. */ + if (COMPARISON_CLASS_P (rhs1) + && INTEGRAL_TYPE_P (rhs1_type) + && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2)) + { + gimple *dummy; + enum vect_def_type dt; + if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo, + &dummy, &dt) + && dt == vect_external_def + && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo, + &dummy, &dt) + && (dt == vect_external_def + || dt == vect_constant_def)) + { + tree wide_scalar_type = build_nonstandard_integer_type + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), + TYPE_UNSIGNED (rhs1_type)); + tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) + return NULL; + } + } + /* If rhs1 is a comparison we need to move it into a separate statement. */ if (TREE_CODE (rhs1) != SSA_NAME) Index: gcc/testsuite/gcc.target/i386/vectorize9.c =================================================================== --- gcc/testsuite/gcc.target/i386/vectorize9.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/vectorize9.c (working copy) @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */ + +double x[1024][1024], red[1024]; +void foo (void) +{ + for (int i = 0; i < 1024; ++i) + for (int j = 0; j < 1024; ++j) + { + double v = i == 0 ? 0.0 : red[j]; + v = v + x[i][j]; + red[j] = v; + } +} + +/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */ Index: gcc/testsuite/gcc.target/i386/vectorize10.c =================================================================== --- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy) @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details -fno-tree-loop-im -msse2 -mno-avx" } */ + +double x[1024][1024], red[1024]; +void foo (void) +{ + for (int i = 0; i < 1024; ++i) + for (int j = 0; j < 1024; ++j) + { + double v = i == 0 ? 0.0 : red[j]; + v = v + x[i][j]; + red[j] = v; + } +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */