Message ID | 20150316162102.GR1746@tucnak.redhat.com |
---|---|
State | New |
Headers | show |
On March 16, 2015 5:21:02 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote: >Hi! > >On the following testcase, gimple LIM creates a vector COND_EXPR >(scalar >condition, vector lhs, rhs2 and rhs3), but if we don't have >corresponding >vector mode for it, we ICE trying to expand the BLKmode COND_EXPR, as >it is >unprepared for that. > >This patch lowers those (parallel or piecewise). >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? OK. Though maybe LIM should not create these for cost reasons? I also wonder if we should lower them to control flow? Thanks, Richard. >2015-03-16 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/65427 > * tree-vect-generic.c (do_cond, expand_vector_scalar_condition): New > functions. > (expand_vector_operations_1): Handle BLKmode vector COND_EXPR. > > * gcc.c-torture/execute/pr65427.c: New test. > >--- gcc/tree-vect-generic.c.jj 2015-01-15 20:25:40.000000000 +0100 >+++ gcc/tree-vect-generic.c 2015-03-16 14:25:37.391932269 +0100 >@@ -1417,6 +1417,57 @@ count_type_subparts (tree type) > return VECTOR_TYPE_P (type) ? TYPE_VECTOR_SUBPARTS (type) : 1; > } > >+static tree >+do_cond (gimple_stmt_iterator *gsi, tree inner_type, tree a, tree b, >+ tree bitpos, tree bitsize, enum tree_code code) >+{ >+ if (TREE_CODE (TREE_TYPE (a)) == VECTOR_TYPE) >+ a = tree_vec_extract (gsi, inner_type, a, bitsize, bitpos); >+ if (TREE_CODE (TREE_TYPE (b)) == VECTOR_TYPE) >+ b = tree_vec_extract (gsi, inner_type, b, bitsize, bitpos); >+ tree cond = gimple_assign_rhs1 (gsi_stmt (*gsi)); >+ return gimplify_build3 (gsi, code, inner_type, cond, a, b); >+} >+ >+/* Expand a vector COND_EXPR to scalars, piecewise. */ >+static void >+expand_vector_scalar_condition (gimple_stmt_iterator *gsi) >+{ >+ gassign *stmt = as_a <gassign *> (gsi_stmt (*gsi)); >+ tree type = gimple_expr_type (stmt); >+ tree compute_type = get_compute_type (COND_EXPR, mov_optab, type); >+ machine_mode compute_mode = TYPE_MODE (compute_type); >+ gcc_assert (compute_mode != BLKmode); >+ tree lhs = gimple_assign_lhs (stmt); >+ tree rhs2 = gimple_assign_rhs2 (stmt); >+ tree rhs3 = gimple_assign_rhs3 (stmt); >+ tree new_rhs; >+ >+ /* If the compute mode is not a vector mode (hence we are not >decomposing >+ a BLKmode vector to smaller, hardware-supported vectors), we may >want >+ to expand the operations in parallel. */ >+ if (GET_MODE_CLASS (compute_mode) != MODE_VECTOR_INT >+ && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FLOAT >+ && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FRACT >+ && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UFRACT >+ && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_ACCUM >+ && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UACCUM) >+ new_rhs = expand_vector_parallel (gsi, do_cond, type, rhs2, rhs3, >+ COND_EXPR); >+ else >+ new_rhs = expand_vector_piecewise (gsi, do_cond, type, >compute_type, >+ rhs2, rhs3, COND_EXPR); >+ if (!useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE >(new_rhs))) >+ new_rhs = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, TREE_TYPE >(lhs), >+ new_rhs); >+ >+ /* NOTE: We should avoid using gimple_assign_set_rhs_from_tree. One >+ way to do it is change expand_vector_operation and its callees to >+ return a tree_code, RHS1 and RHS2 instead of a tree. */ >+ gimple_assign_set_rhs_from_tree (gsi, new_rhs); >+ update_stmt (gsi_stmt (*gsi)); >+} >+ >/* Process one statement. If we identify a vector operation, expand >it. */ > > static void >@@ -1449,6 +1500,14 @@ expand_vector_operations_1 (gimple_stmt_ > return; > } > >+ if (code == COND_EXPR >+ && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) == >VECTOR_TYPE >+ && TYPE_MODE (TREE_TYPE (gimple_assign_lhs (stmt))) == BLKmode) >+ { >+ expand_vector_scalar_condition (gsi); >+ return; >+ } >+ > if (code == CONSTRUCTOR > && TREE_CODE (lhs) == SSA_NAME > && VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (lhs))) >--- gcc/testsuite/gcc.c-torture/execute/pr65427.c.jj 2015-03-16 >14:36:29.489254701 +0100 >+++ gcc/testsuite/gcc.c-torture/execute/pr65427.c 2015-03-16 >14:40:58.789851433 +0100 >@@ -0,0 +1,34 @@ >+/* PR tree-optimization/65427 */ >+ >+typedef int V __attribute__ ((vector_size (8 * sizeof (int)))); >+V a, b, c, d, e, f; >+ >+__attribute__((noinline, noclone)) void >+foo (int x, int y) >+{ >+ do >+ { >+ if (x) >+ d = a ^ c; >+ else >+ d = a ^ b; >+ } >+ while (y); >+} >+ >+int >+main () >+{ >+ a = (V) { 1, 2, 3, 4, 5, 6, 7, 8 }; >+ b = (V) { 0x40, 0x80, 0x40, 0x80, 0x40, 0x80, 0x40, 0x80 }; >+ e = (V) { 0x41, 0x82, 0x43, 0x84, 0x45, 0x86, 0x47, 0x88 }; >+ foo (0, 0); >+ if (__builtin_memcmp (&d, &e, sizeof (V)) != 0) >+ __builtin_abort (); >+ c = (V) { 0x80, 0x40, 0x80, 0x40, 0x80, 0x40, 0x80, 0x40 }; >+ f = (V) { 0x81, 0x42, 0x83, 0x44, 0x85, 0x46, 0x87, 0x48 }; >+ foo (1, 0); >+ if (__builtin_memcmp (&d, &f, sizeof (V)) != 0) >+ __builtin_abort (); >+ return 0; >+} > > Jakub
On Mon, Mar 16, 2015 at 07:15:59PM +0100, Richard Biener wrote: > On March 16, 2015 5:21:02 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote: > >On the following testcase, gimple LIM creates a vector COND_EXPR > >(scalar > >condition, vector lhs, rhs2 and rhs3), but if we don't have > >corresponding > >vector mode for it, we ICE trying to expand the BLKmode COND_EXPR, as > >it is > >unprepared for that. > > > >This patch lowers those (parallel or piecewise). > >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > OK. Though maybe LIM should not create these for cost reasons? I also wonder if we should lower them to control flow? Yeah, I've been thinking about teaching LIM not to do that if it is BLKmode. But then found how many other spots create COND_EXPRs and thought I just can't catch all of them. But guess I can change LIM incrementally too. I've also been thinking about lowering it to control flow, but only if it couldn't be done in say two halves comparison as in the testcase. I suppose doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow, but if it is more than that (say 4 or 8+) it might be already better to just expand it as a PHI. But, as we don't create basic blocks in tree-vect-generic.c right now, I thought it might be too much for stage4. Jakub
On March 16, 2015 7:26:58 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote: >On Mon, Mar 16, 2015 at 07:15:59PM +0100, Richard Biener wrote: >> On March 16, 2015 5:21:02 PM GMT+01:00, Jakub Jelinek ><jakub@redhat.com> wrote: >> >On the following testcase, gimple LIM creates a vector COND_EXPR >> >(scalar >> >condition, vector lhs, rhs2 and rhs3), but if we don't have >> >corresponding >> >vector mode for it, we ICE trying to expand the BLKmode COND_EXPR, >as >> >it is >> >unprepared for that. >> > >> >This patch lowers those (parallel or piecewise). >> >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? >> >> OK. Though maybe LIM should not create these for cost reasons? I >also wonder if we should lower them to control flow? > >Yeah, I've been thinking about teaching LIM not to do that if it is >BLKmode. >But then found how many other spots create COND_EXPRs and thought I >just >can't catch all of them. But guess I can change LIM incrementally too. > >I've also been thinking about lowering it to control flow, but only if >it >couldn't be done in say two halves comparison as in the testcase. I >suppose >doing say 2 V4SImode COND_EXPRs would still be beneficial over control >flow, >but if it is more than that (say 4 or 8+) it might be already better to >just >expand it as a PHI. But, as we don't create basic blocks in >tree-vect-generic.c right now, I thought it might be too much for >stage4. Yeah, the patch is fine for stage 4. I just wrote down my random thoughts. Richard. > Jakub
On 16/03/15 18:26, Jakub Jelinek wrote: > On Mon, Mar 16, 2015 at 07:15:59PM +0100, Richard Biener wrote: >> On March 16, 2015 5:21:02 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote: >>> On the following testcase, gimple LIM creates a vector COND_EXPR >>> (scalar >>> condition, vector lhs, rhs2 and rhs3), but if we don't have >>> corresponding >>> vector mode for it, we ICE trying to expand the BLKmode COND_EXPR, as >>> it is >>> unprepared for that. >>> >>> This patch lowers those (parallel or piecewise). >>> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? >> OK. Though maybe LIM should not create these for cost reasons? I also wonder if we should lower them to control flow? > Yeah, I've been thinking about teaching LIM not to do that if it is BLKmode. > But then found how many other spots create COND_EXPRs and thought I just > can't catch all of them. But guess I can change LIM incrementally too. > > I've also been thinking about lowering it to control flow, but only if it > couldn't be done in say two halves comparison as in the testcase. I suppose > doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow, > but if it is more than that (say 4 or 8+) it might be already better to just > expand it as a PHI. But, as we don't create basic blocks in > tree-vect-generic.c right now, I thought it might be too much for stage4. Hi Jakub, The testcase ICEs on arm-none-eabi at -Os (only): 0x7e6f11 process_insert_insn $TOP/gcc/gcse.c:2174 0x7e8a80 insert_insn_end_basic_block $TOP/gcc/gcse.c:2196 0x7eab01 hoist_code $TOP/gcc/gcse.c:3492 0x7eab01 one_code_hoisting_pass $TOP/gcc/gcse.c:3722 0x7eab01 execute_rtl_hoist $TOP/gcc/gcse.c:4212 0x7eab01 execute $TOP/gcc/gcse.c:4293 Thanks, Kyrill > Jakub >
On Wed, Mar 18, 2015 at 03:10:44PM +0000, Kyrill Tkachov wrote: > >I've also been thinking about lowering it to control flow, but only if it > >couldn't be done in say two halves comparison as in the testcase. I suppose > >doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow, > >but if it is more than that (say 4 or 8+) it might be already better to just > >expand it as a PHI. But, as we don't create basic blocks in > >tree-vect-generic.c right now, I thought it might be too much for stage4. > The testcase ICEs on arm-none-eabi at -Os (only): > 0x7e6f11 process_insert_insn > $TOP/gcc/gcse.c:2174 > 0x7e8a80 insert_insn_end_basic_block > $TOP/gcc/gcse.c:2196 > 0x7eab01 hoist_code > $TOP/gcc/gcse.c:3492 > 0x7eab01 one_code_hoisting_pass > $TOP/gcc/gcse.c:3722 > 0x7eab01 execute_rtl_hoist > $TOP/gcc/gcse.c:4212 > 0x7eab01 execute > $TOP/gcc/gcse.c:4293 Must be some unrelated latent issue, either in arm backend or in gcse. I think the lowering I've done could be very easily just present in user code, so shouldn't be that hard to adjust the testcase so that it ICEs already before the commit. Jakub
On 18/03/15 15:14, Jakub Jelinek wrote: > On Wed, Mar 18, 2015 at 03:10:44PM +0000, Kyrill Tkachov wrote: >>> I've also been thinking about lowering it to control flow, but only if it >>> couldn't be done in say two halves comparison as in the testcase. I suppose >>> doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow, >>> but if it is more than that (say 4 or 8+) it might be already better to just >>> expand it as a PHI. But, as we don't create basic blocks in >>> tree-vect-generic.c right now, I thought it might be too much for stage4. >> The testcase ICEs on arm-none-eabi at -Os (only): >> 0x7e6f11 process_insert_insn >> $TOP/gcc/gcse.c:2174 >> 0x7e8a80 insert_insn_end_basic_block >> $TOP/gcc/gcse.c:2196 >> 0x7eab01 hoist_code >> $TOP/gcc/gcse.c:3492 >> 0x7eab01 one_code_hoisting_pass >> $TOP/gcc/gcse.c:3722 >> 0x7eab01 execute_rtl_hoist >> $TOP/gcc/gcse.c:4212 >> 0x7eab01 execute >> $TOP/gcc/gcse.c:4293 > Must be some unrelated latent issue, either in arm backend or in gcse. > I think the lowering I've done could be very easily just present in user > code, so shouldn't be that hard to adjust the testcase so that it ICEs > already before the commit. Yep, I'm seeing the same ICE in the testcsase before the patch. Kyrill > > Jakub >
On 18/03/15 15:20, Kyrill Tkachov wrote: > On 18/03/15 15:14, Jakub Jelinek wrote: >> On Wed, Mar 18, 2015 at 03:10:44PM +0000, Kyrill Tkachov wrote: >>>> I've also been thinking about lowering it to control flow, but only if it >>>> couldn't be done in say two halves comparison as in the testcase. I suppose >>>> doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow, >>>> but if it is more than that (say 4 or 8+) it might be already better to just >>>> expand it as a PHI. But, as we don't create basic blocks in >>>> tree-vect-generic.c right now, I thought it might be too much for stage4. >>> The testcase ICEs on arm-none-eabi at -Os (only): >>> 0x7e6f11 process_insert_insn >>> $TOP/gcc/gcse.c:2174 >>> 0x7e8a80 insert_insn_end_basic_block >>> $TOP/gcc/gcse.c:2196 >>> 0x7eab01 hoist_code >>> $TOP/gcc/gcse.c:3492 >>> 0x7eab01 one_code_hoisting_pass >>> $TOP/gcc/gcse.c:3722 >>> 0x7eab01 execute_rtl_hoist >>> $TOP/gcc/gcse.c:4212 >>> 0x7eab01 execute >>> $TOP/gcc/gcse.c:4293 >> Must be some unrelated latent issue, either in arm backend or in gcse. >> I think the lowering I've done could be very easily just present in user >> code, so shouldn't be that hard to adjust the testcase so that it ICEs >> already before the commit. > Yep, I'm seeing the same ICE in the testcsase before the patch. With some help from James G, I've got a potential fix to gcse.c in testing. Kyrill > Kyrill > >> Jakub >> > >
--- gcc/tree-vect-generic.c.jj 2015-01-15 20:25:40.000000000 +0100 +++ gcc/tree-vect-generic.c 2015-03-16 14:25:37.391932269 +0100 @@ -1417,6 +1417,57 @@ count_type_subparts (tree type) return VECTOR_TYPE_P (type) ? TYPE_VECTOR_SUBPARTS (type) : 1; } +static tree +do_cond (gimple_stmt_iterator *gsi, tree inner_type, tree a, tree b, + tree bitpos, tree bitsize, enum tree_code code) +{ + if (TREE_CODE (TREE_TYPE (a)) == VECTOR_TYPE) + a = tree_vec_extract (gsi, inner_type, a, bitsize, bitpos); + if (TREE_CODE (TREE_TYPE (b)) == VECTOR_TYPE) + b = tree_vec_extract (gsi, inner_type, b, bitsize, bitpos); + tree cond = gimple_assign_rhs1 (gsi_stmt (*gsi)); + return gimplify_build3 (gsi, code, inner_type, cond, a, b); +} + +/* Expand a vector COND_EXPR to scalars, piecewise. */ +static void +expand_vector_scalar_condition (gimple_stmt_iterator *gsi) +{ + gassign *stmt = as_a <gassign *> (gsi_stmt (*gsi)); + tree type = gimple_expr_type (stmt); + tree compute_type = get_compute_type (COND_EXPR, mov_optab, type); + machine_mode compute_mode = TYPE_MODE (compute_type); + gcc_assert (compute_mode != BLKmode); + tree lhs = gimple_assign_lhs (stmt); + tree rhs2 = gimple_assign_rhs2 (stmt); + tree rhs3 = gimple_assign_rhs3 (stmt); + tree new_rhs; + + /* If the compute mode is not a vector mode (hence we are not decomposing + a BLKmode vector to smaller, hardware-supported vectors), we may want + to expand the operations in parallel. */ + if (GET_MODE_CLASS (compute_mode) != MODE_VECTOR_INT + && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FLOAT + && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FRACT + && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UFRACT + && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_ACCUM + && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UACCUM) + new_rhs = expand_vector_parallel (gsi, do_cond, type, rhs2, rhs3, + COND_EXPR); + else + new_rhs = expand_vector_piecewise (gsi, do_cond, type, compute_type, + rhs2, rhs3, COND_EXPR); + if (!useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE (new_rhs))) + new_rhs = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), + new_rhs); + + /* NOTE: We should avoid using gimple_assign_set_rhs_from_tree. One + way to do it is change expand_vector_operation and its callees to + return a tree_code, RHS1 and RHS2 instead of a tree. */ + gimple_assign_set_rhs_from_tree (gsi, new_rhs); + update_stmt (gsi_stmt (*gsi)); +} + /* Process one statement. If we identify a vector operation, expand it. */ static void @@ -1449,6 +1500,14 @@ expand_vector_operations_1 (gimple_stmt_ return; } + if (code == COND_EXPR + && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) == VECTOR_TYPE + && TYPE_MODE (TREE_TYPE (gimple_assign_lhs (stmt))) == BLKmode) + { + expand_vector_scalar_condition (gsi); + return; + } + if (code == CONSTRUCTOR && TREE_CODE (lhs) == SSA_NAME && VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (lhs))) --- gcc/testsuite/gcc.c-torture/execute/pr65427.c.jj 2015-03-16 14:36:29.489254701 +0100 +++ gcc/testsuite/gcc.c-torture/execute/pr65427.c 2015-03-16 14:40:58.789851433 +0100 @@ -0,0 +1,34 @@ +/* PR tree-optimization/65427 */ + +typedef int V __attribute__ ((vector_size (8 * sizeof (int)))); +V a, b, c, d, e, f; + +__attribute__((noinline, noclone)) void +foo (int x, int y) +{ + do + { + if (x) + d = a ^ c; + else + d = a ^ b; + } + while (y); +} + +int +main () +{ + a = (V) { 1, 2, 3, 4, 5, 6, 7, 8 }; + b = (V) { 0x40, 0x80, 0x40, 0x80, 0x40, 0x80, 0x40, 0x80 }; + e = (V) { 0x41, 0x82, 0x43, 0x84, 0x45, 0x86, 0x47, 0x88 }; + foo (0, 0); + if (__builtin_memcmp (&d, &e, sizeof (V)) != 0) + __builtin_abort (); + c = (V) { 0x80, 0x40, 0x80, 0x40, 0x80, 0x40, 0x80, 0x40 }; + f = (V) { 0x81, 0x42, 0x83, 0x44, 0x85, 0x46, 0x87, 0x48 }; + foo (1, 0); + if (__builtin_memcmp (&d, &f, sizeof (V)) != 0) + __builtin_abort (); + return 0; +}