| Message ID | 56C47D5C.7010804@redhat.com |
|---|---|
| State | New |
| Headers | show |
On Wed, Feb 17, 2016 at 3:02 PM, Jeff Law <law@redhat.com> wrote: > On 02/17/2016 03:48 AM, Richard Biener wrote: > >>> I instrumented a bootstrap -- the improved DSE finds ~20k additional DSE >>> opportunities during a GCC bootstrap that could not be found by the >>> current >>> DSE. Yes, 20k additional statements deleted by tree DSE. Yow! >> >> >> Well, DCE also can do quite some DSE and it runs after DSE - did that 20k >> more DSE affect the overall end-result? > > I haven't looked at that yet. I just got the instrumentation data last > night. > > >>> Of those additional opportunities > 99% are for sizes of 64 bytes or >>> smaller. Thus we can pack those into 1 or 2 bitmap elements, depending >>> on >>> the starting offset. So the bitmap side will be efficient with no real >>> searching if we choose our PARAM value wisely. >> >> >> So then please use a uint64_t or even uint32_t mask please. Which means >> a fixed size SBITMAP (32 bits) if you like to use the bitmap interface. > > I actually prefer the standard bitmap interface as it seamlessly handles > differences in the starting offset for the writes. > >> >> Can you share your work-in-progress patch? > > Easy 'nuff. This will bootstrap and regression test. Was planning to spend > today generating some additional testcodes from new cases it catches and > looking at impacts on code generation. > > I'm particularly interested in any impact on the zero-sized object clobbers. > I'd like to remove the bits which filter those out. > > It feels like there's some refactoring that ought to happen in this code. > Both in terms of the mostly duplicated test that a particular ref is > "interesting" and with mostly duplicated code to extract a ref from a mem* > or assignment. 
> > jeff > > > commit d49afd895524df98c5e53280b1c77f4b61a45ba3 > Author: Jeff Law <law@tor.usersys.redhat.com> > Date: Tue Feb 16 13:44:20 2016 -0500 > > Checkpoint > > CHeckpoint > > Another checkpoint > > Checkpoint > > diff --git a/gcc/params.def b/gcc/params.def > index c0494fa..5aa146b 100644 > --- a/gcc/params.def > +++ b/gcc/params.def > @@ -520,6 +520,11 @@ DEFPARAM(PARAM_IV_ALWAYS_PRUNE_CAND_SET_BOUND, > "If number of candidates in the set is smaller, we always try to > remove unused ivs during its optimization.", > 10, 0, 0) > > +DEFPARAM(PARAM_DSE_MAX_OBJECT_SIZE, > + "dse-max-object-size", > + "Maximum size (in bytes) of objects tracked by dead store > elimination.", > + 64, 0, 0) > + > DEFPARAM(PARAM_SCEV_MAX_EXPR_SIZE, > "scev-max-expr-size", > "Bound on size of expressions used in the scalar evolutions > analyzer.", > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/complex-4.c > b/gcc/testsuite/gcc.dg/tree-ssa/complex-4.c > index 87a2638..3155741 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/complex-4.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/complex-4.c > @@ -10,4 +10,4 @@ int f(void) > return g(&t); > } > > -/* { dg-final { scan-tree-dump-times "__complex__" 0 "optimized" { xfail > *-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "__complex__" 0 "optimized" } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/complex-5.c > b/gcc/testsuite/gcc.dg/tree-ssa/complex-5.c > index e2cd403..e6d027f 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/complex-5.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/complex-5.c > @@ -8,4 +8,4 @@ int f(void) > __imag__ t = 2; > } > > -/* { dg-final { scan-tree-dump-times "__complex__" 0 "optimized" { xfail > *-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "__complex__" 0 "optimized" } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-9.c > b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-9.c > index 594c20c..ae48ddd 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-9.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-9.c > @@ 
-11,4 +11,4 @@ foo () > } > > /* We should eliminate the first assignment. */ > -/* { dg-final { scan-tree-dump-times "VDEF" 2 "dse1" { xfail *-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "VDEF" 2 "dse1" } } */ > diff --git a/gcc/tree-ssa-dse.c b/gcc/tree-ssa-dse.c > index 372a0be..97a091b 100644 > --- a/gcc/tree-ssa-dse.c > +++ b/gcc/tree-ssa-dse.c > @@ -33,6 +33,7 @@ along with GCC; see the file COPYING3. If not see > #include "tree-dfa.h" > #include "domwalk.h" > #include "tree-cfgcleanup.h" > +#include "params.h" > > /* This file implements dead store elimination. > > @@ -68,6 +69,58 @@ along with GCC; see the file COPYING3. If not see > remove their dead edges eventually. */ > static bitmap need_eh_cleanup; > > +/* Clear any bytes written by STMT from the bitmap LIVE_BYTES. The base > + address written by STMT must match the one found in REF, which must > + have its base address previously initialized. > + > + This routine must be conservative. If we don't know the offset or > + actual size written, assume nothing was written. */ > + > +static void > +clear_bytes_written_by (bitmap live_bytes, gimple *stmt, ao_ref *ref) > +{ > + ao_ref write; > + write.base = NULL; > + > + /* It's advantageous to handle certain mem* functions. */ > + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)) > + { > + switch (DECL_FUNCTION_CODE (gimple_call_fndecl (stmt))) > + { > + case BUILT_IN_MEMCPY: > + case BUILT_IN_MEMMOVE: > + case BUILT_IN_MEMSET: > + { > + tree size = NULL_TREE; > + if (gimple_call_num_args (stmt) == 3) > + size = gimple_call_arg (stmt, 2); > + tree ptr = gimple_call_arg (stmt, 0); > + ao_ref_init_from_ptr_and_size (&write, ptr, size); > + ao_ref_base (&write); > + } > + default: > + break; > + } > + } > + else if (is_gimple_assign (stmt)) > + { > + ao_ref_init (&write, gimple_assign_lhs (stmt)); > + ao_ref_base (&write); > + } > + > + /* Verify we have the same base memory address and that the write > + has a known size. 
If so, then clear the appropriate bytes in > + the LIVE_BYTES bitmap. */ > + if (write.base > + && write.base == ref->base > + && write.size == write.max_size > + && (write.size % BITS_PER_UNIT) == 0 > + && (write.offset % BITS_PER_UNIT) == 0 > + && write.max_size != -1) > + bitmap_clear_range (live_bytes, > + write.offset / BITS_PER_UNIT, > + write.size / BITS_PER_UNIT); > +} > > /* A helper of dse_optimize_stmt. > Given a GIMPLE_ASSIGN in STMT that writes to REF, find a candidate > @@ -79,9 +132,33 @@ dse_possible_dead_store_p (ao_ref *ref, gimple *stmt, > gimple **use_stmt) > { > gimple *temp; > unsigned cnt = 0; > + bitmap live_bytes = NULL; > > *use_stmt = NULL; > > + /* REF is a memory write. Go ahead and get its base, size, extent > + information and encode the bytes written into LIVE_BYTES. We can > + handle any case where we have a known base and maximum size. > + > + However, experimentation has shown that bit level tracking is not > + useful in practice, so we only track at the byte level. > + > + Furthermore, experimentation has shown that 99% of the cases > + that require byte tracking are 64 bytes or less. Tracking 64 > + bytes also happens to fit nicely into a bitmap element. */ > + if (ao_ref_base (ref) > + && ref->max_size > + && (ref->offset % BITS_PER_UNIT) == 0 > + && (ref->max_size % BITS_PER_UNIT) == 0 > + && ref->max_size <= PARAM_VALUE (PARAM_DSE_MAX_OBJECT_SIZE) > + && ref->max_size != -1) > + { > + live_bytes = BITMAP_ALLOC (NULL); > + bitmap_set_range (live_bytes, > + ref->offset / BITS_PER_UNIT, > + ref->max_size / BITS_PER_UNIT); > + } > + > /* Find the first dominated statement that clobbers (part of) the > memory stmt stores to with no intermediate statement that may use > part of the memory stmt stores. 
That is, find a store that may > @@ -177,11 +254,18 @@ dse_possible_dead_store_p (ao_ref *ref, gimple *stmt, > gimple **use_stmt) > temp = stmt; > break; > } > + > + if (live_bytes && temp) > + clear_bytes_written_by (live_bytes, temp, ref); > } > - /* Continue walking until we reach a kill. */ > - while (!stmt_kills_ref_p (temp, ref)); > + /* Continue walking until we reach a full kill as a single statement > + or there are no more live bytes. */ > + while (!stmt_kills_ref_p (temp, ref) > + && !(live_bytes && bitmap_empty_p (live_bytes))); Just a short quick comment - the above means you only handle partial stores with no interveaning uses. You don't handle, say struct S { struct R { int x; int y; } r; int z; } s; s = { {1, 2}, 3 }; s.r.x = 1; s.r.y = 2; struct R r = s.r; s.z = 3; where s = { {1, 2}, 3} is still dead. That is, you don't make use of the live_bytes in the ref_maybe_used_by_stmt_p check (you can skip uses of only dead bytes). Not sure if it makes a difference in practice (compared to the cost it would take). Rather than building ao_refs in clear_bytes_written_by just use get_ref_base_and_extent directly. You don't handle stuff like s[i] = { 1, 2 }; s[i].x = 1; s[i].y = 1; either btw. Richard. > *use_stmt = temp; > + if (live_bytes) > + BITMAP_FREE (live_bytes); > > return true; > } >
On 02/17/2016 07:13 AM, Richard Biener wrote: >> - /* Continue walking until we reach a kill. */ >> - while (!stmt_kills_ref_p (temp, ref)); >> + /* Continue walking until we reach a full kill as a single statement >> + or there are no more live bytes. */ >> + while (!stmt_kills_ref_p (temp, ref) >> + && !(live_bytes && bitmap_empty_p (live_bytes))); > > Just a short quick comment - the above means you only handle partial stores > with no interveaning uses. You don't handle, say > > struct S { struct R { int x; int y; } r; int z; } s; > > s = { {1, 2}, 3 }; > s.r.x = 1; > s.r.y = 2; > struct R r = s.r; > s.z = 3; > > where s = { {1, 2}, 3} is still dead. Right. But handling that has never been part of DSE's design goals. Once there's a use, DSE has always given up. Having said that... > > That is, you don't make use of the live_bytes in the ref_maybe_used_by_stmt_p > check (you can skip uses of only dead bytes). > > Not sure if it makes a difference in practice (compared to the cost it > would take). Not sure either. It doesn't appear that it would be hard to experiment with that to see if it's worth the effort. My gut feeling is we're not going to see this often, if at all, in practice. > > Rather than building ao_refs in clear_bytes_written_by just use > get_ref_base_and_extent > directly. Easy enough to do, but ISTM if we use get_ref_base_and_extent in clear_bytes_written_by, then the other blob of similar code in tree-ssa-dse should be handled in the same way. ie, the code you see in clear_bytes_written_by is almost a direct copy of code already existing in tree-ssa-dse.c (hence my feeling that there's some refactoring of that code that we want to do). > > You don't handle stuff like > > s[i] = { 1, 2 }; > s[i].x = 1; > s[i].y = 1; > > either btw. Correct I believe. 
IIRC (I think I looked at this during debugging at some point), the ao_ref->max_size field will cover the entire array for this kind of situation because we don't know which element in the array we're hitting (or -1 if we don't know the array's size). I don't see a reasonable way to handle it with an ao_ref style interface unless the variable parts of the address computation are all rolled into the ao_ref->base field. I did look for cases where the initial store was to a varying location and thus max_size covered the entire array with killing stores that eventually covered the entire array (but with each individual killing store having size == max_size) -- the situation never came up in the codes I looked at (gcc & its runtime libraries of course). Jeff
On Wed, Feb 17, 2016 at 5:10 PM, Jeff Law <law@redhat.com> wrote: > On 02/17/2016 07:13 AM, Richard Biener wrote: >>> >>> - /* Continue walking until we reach a kill. */ >>> - while (!stmt_kills_ref_p (temp, ref)); >>> + /* Continue walking until we reach a full kill as a single statement >>> + or there are no more live bytes. */ >>> + while (!stmt_kills_ref_p (temp, ref) >>> + && !(live_bytes && bitmap_empty_p (live_bytes))); >> >> >> Just a short quick comment - the above means you only handle partial >> stores >> with no interveaning uses. You don't handle, say >> >> struct S { struct R { int x; int y; } r; int z; } s; >> >> s = { {1, 2}, 3 }; >> s.r.x = 1; >> s.r.y = 2; >> struct R r = s.r; >> s.z = 3; >> >> where s = { {1, 2}, 3} is still dead. > > Right. But handling that has never been part of DSE's design goals. Once > there's a use, DSE has always given up. Yeah, which is why I in the end said we need a "better" DSE ... > Having said that... > >> >> That is, you don't make use of the live_bytes in the >> ref_maybe_used_by_stmt_p >> check (you can skip uses of only dead bytes). >> >> Not sure if it makes a difference in practice (compared to the cost it >> would take). > > Not sure either. It doesn't appear that it would be hard to experiment with > that to see if it's worth the effort. My gut feeling is we're not going to > see this often, if at all, in practice. Yeah, I think the case we're after and that happens most is sth like a = { aggregate init }; a.a = ...; a.b = ...; ... and what you add is the ability to remove the aggregate init completely. What would be nice to have is to remove it partly as well, as for struct { int i; int j; int k; } a = {}; a.i = 1; a.k = 3; we'd like to remove the whole-a zeroing but we need to keep zeroing of a.j. 
I believe that SRA already has most of the analysis part, what it is lacking is that SRA works not flow-sensitive (it just gathers function-wide data) and that it doesn't consider base objects that have their address taken or are pointer-based. >> >> Rather than building ao_refs in clear_bytes_written_by just use >> get_ref_base_and_extent >> directly. > > Easy enough to do, but ISTM if we use get_ref_base_and_extent in > clear_bytes_written-by, then the other blob of similar code in tree-ssa-dse > should be handled in the same way. ie, the code you see in > clear_bytes_written_by is almost a direct copy of code already existing in > tree-ssa-dse.c (hence my feeling that there's some refactoring of that code > that we want to do). > > > >> >> You don't handle stuff like >> >> s[i] = { 1, 2 }; >> s[i].x = 1; >> s[i].y = 1; >> >> either btw. > > Correct I believe. > > IIRC (I think I looked at this during debugging at some point), the > ao_ref->max_size field will cover the entire array for this kind of > situation because we don't know which element in the array we're hitting (or > -1 if we don't know the array's size). I don't see a reasonable way to > handle it with an ao_ref style interface unless the variable parts of the > address computation are all rolled into the ao_ref->base field. > > I did look for cases where the initial store was to a varying location and > thus max_size covered the entire array with killing stores that eventually > covered the entire array (but with each individual killing store having size > == max_size) -- the situation never came up in the codes I looked at (gcc & > its runtime libraries of course). I think the only way to handle this case is to "strip" a common base with a varying address and replace it for the sake of get_ref_base_and_extent (that is, tell get_ref_base_and_extent to stop at s[i] in this case). So you split the piece-alias-test into a same-base comparison plus offset/size ontop of that. 
That said, your patch addresses a very specific case nothing else in the compiler handles right now, but it's also quite limited. So evaluating the compile-time impact is important here as well as trying to cover more cases of "partial inits after full inits" optimization. I don't believe we need to rush this into GCC 6. Richard. > Jeff
On 02/18/2016 02:56 AM, Richard Biener wrote: >> >> Right. But handling that has never been part of DSE's design goals. Once >> there's a use, DSE has always given up. > > Yeah, which is why I in the end said we need a "better" DSE ... Well, I don't see a rewrite/redesign in the mid-term horizon. > >> Having said that... >> >>> >>> That is, you don't make use of the live_bytes in the >>> ref_maybe_used_by_stmt_p >>> check (you can skip uses of only dead bytes). >>> >>> Not sure if it makes a difference in practice (compared to the cost it >>> would take). >> >> Not sure either. It doesn't appear that it would be hard to experiment with >> that to see if it's worth the effort. My gut feeling is we're not going to >> see this often, if at all, in practice. > > Yeah, I think the case we're after and that happens most is sth like > > a = { aggregate init }; > a.a = ...; > a.b = ...; > ... > > and what you add is the ability to remove the aggregate init completely. Essentially, yes. > > What would be nice to have is to remove it partly as well, as for > > struct { int i; int j; int k; } a = {}; > a.i = 1; > a.k = 3; > > we'd like to remove the whole-a zeroing but we need to keep zeroing > of a.j. > > I believe that SRA already has most of the analysis part, what it is > lacking is that SRA works not flow-sensitive (it just gathers > function-wide data) and that it doesn't consider base objects that > have their address taken or are pointer-based. I guess we could look at the live bytes at the end of the loop in dse_possible_dead_store_p and perhaps re-write the original statement. > > That said, your patch addresses a very specific case nothing else in > the compiler > handles right now, but it's also quite limited. So evaluating the compile-time > impact is important here as well as trying to cover more cases of "partial inits > after full inits" optimization. It's fairly narrow. 
Most of the stuff it's finding is just the clobbers and removing those is totally uninteresting from a code generation standpoint. I'm going to do another round of statistics gathering, filtering out all the clobbers. Some light poking would indicate there are still real-world codes where this will help (in libstdc++, not surprisingly), but it's not going to be as broad as I'd like. > > I don't believe we need to rush this into GCC 6. Not looking to rush this -- it seems simple enough and fixes a long-standing regression. I think it's appropriate, but I'm not going to get bent out of shape if you disagree and we table it until GCC 7. jeff
On 02/18/2016 02:56 AM, Richard Biener wrote: >>> Just a short quick comment - the above means you only handle partial >>> stores >>> with no interveaning uses. You don't handle, say >>> >>> struct S { struct R { int x; int y; } r; int z; } s; >>> >>> s = { {1, 2}, 3 }; >>> s.r.x = 1; >>> s.r.y = 2; >>> struct R r = s.r; >>> s.z = 3; >>> >>> where s = { {1, 2}, 3} is still dead. >> >> Right. But handling that has never been part of DSE's design goals. Once >> there's a use, DSE has always given up. > > Yeah, which is why I in the end said we need a "better" DSE ... So I cobbled up a quick test for this. I only handle assignments which may reference the same memory as the currently tracked store. Obviously that could be extended to handle certain builtins and the like. It triggers a bit here and there. While looking at those cases it occurred to me that, we could also look at this as a failure earlier in the optimization pipeline. In fact DOM already has code to handle a closely related situation. When DOM sees a store to memory, it creates a new fake statement with the RHS and LHS reversed. So in the case above DOM creates statements that look like: 1 = s.r.x 2 = s.r.y DOM then puts the RHS into the available expression table as equivalent to the LHS. So if it finds a later load of s.r.x, it will replace that load with "1". I haven't looked at it in a while, but it certainly was functional prior to the tuple merge. Presumably DOM is not looking at r = s.r and realizing it could look s.r piece-wise in the available expression table. If it did, it would effectively turn that fragment into: s = { {1, 2}, 3 }; s.r.x = 1; s.r.y = 2; struct R r = {1, 2} s.z = 3; At which point we no longer have the may-read of s.r.{x,y} and DSE would see the initial assignment as dead. I'm not sure if it's advisable to teach DOM how to lookup structure references piecewise or not. The code to handle this case in DSE is relatively simple, so perhaps we just go with the DSE variant. 
I also looked a bit at cases where we find that while an entire store (such as an aggregate initialization or mem*) may not be dead, pieces of the store may be dead. That's trivial to detect. It triggers relatively often. The trick is once detected, we have to go back and rewrite the original statement to only store the live parts. I've only written the detection code, the rewriting might be somewhat painful. I'm starting to wonder if what we have is a 3-part series. [1/3] The basic tracking to handle 33562, possibly included in gcc-6 [2/3] Ignore reads that reference stuff not in live_bytes for gcc-7 [3/3] Detect partially dead aggregate stores, rewriting the partially dead store to only store the live bytes. Also for gcc-7. Obviously [1/3] would need compile-time benchmarking, but I really don't expect any issues there. Jeff
On Fri, Feb 19, 2016 at 10:01 PM, Jeff Law <law@redhat.com> wrote: > On 02/18/2016 02:56 AM, Richard Biener wrote: >>>> >>>> Just a short quick comment - the above means you only handle partial >>>> stores >>>> with no interveaning uses. You don't handle, say >>>> >>>> struct S { struct R { int x; int y; } r; int z; } s; >>>> >>>> s = { {1, 2}, 3 }; >>>> s.r.x = 1; >>>> s.r.y = 2; >>>> struct R r = s.r; >>>> s.z = 3; >>>> >>>> where s = { {1, 2}, 3} is still dead. >>> >>> >>> Right. But handling that has never been part of DSE's design goals. Once >>> there's a use, DSE has always given up. >> >> >> Yeah, which is why I in the end said we need a "better" DSE ... > > So I cobbled up a quick test for this. I only handle assignments which may > reference the same memory as the currently tracked store. Obviously that > could be extended to handle certain builtins and the like. > > It triggers a bit here and there. While looking at those cases it occurred > to me that, we could also look at this as a failure earlier in the > optimization pipeline. In fact DOM already has code to handle a closely > related situation. > > When DOM sees a store to memory, it creates a new fake statement with the > RHS and LHS reversed. So in the case above DOM creates statements that look > like: > > 1 = s.r.x > 2 = s.r.y > > DOM then puts the RHS into the available expression table as equivalent to > the LHS. So if it finds a later load of s.r.x, it will replace that load > with "1". I haven't looked at it in a while, but it certainly was > functional prior to the tuple merge. It still is functional, that's how it CSEs across stores. > Presumably DOM is not looking at r = s.r and realizing it could look s.r > piece-wise in the available expression table. 
If it did, it would > effectively turn that fragment into: > > s = { {1, 2}, 3 }; > s.r.x = 1; > s.r.y = 2; > struct R r = {1, 2} > s.z = 3; > > At which point we no longer have the may-read of s.r.{x,y} and DSE would see > the initial assignment as dead. Yeah, but if it does not become dead you just increased code size or lifetime... Looking up an aggregate component-wise is also quite expensive (which components are you looking for?). > I'm not sure if it's advisable to teach DOM how to lookup structure > references piecewise or not. The code to handle this case in DSE is > relatively simple, so perhaps we just go with the DSE variant. FRE does something related - it looks at all piecewise uses of 'r' and eventually replaces them with pieces of s.r when seeing the r = s.r aggregate assignment. Of course that only makes the store to r dead if there are no uses of it left. > I also looked a bit at cases where we find that while an entire store (such > as an aggregate initialization or mem*) may not be dead, pieces of the store > may be dead. That's trivial to detect. It triggers relatively often. > The trick is once detected, we have to go back and rewrite the original > statement to only store the live parts. I've only written the detection > code, the rewriting might be somewhat painful. Yes. I think SRA has all the code to do that though, see how it does scalarization of constant pool loads like aggr = *.LC1; SRA has the additional limitation of only handling aggregate decls that don't have their address taken beause it basically assumes no aliasing and just analyzes all accesses in the whole function. So it doesn't have an idea of access ordering. But it's core data structures and routines would be suitable to build up accesses and doing replacements if you do a more conscious, alias-aware analysis of them. So if you don't use a simple bitmap but maybe share 'access' with the SRA code it may help you doing the required transform. 
> I'm starting to wonder if what we have is a 3-part series. > > [1/3] The basic tracking to handle 33562, possibly included in gcc-6 > [2/3] Ignore reads that reference stuff not in live_bytes for gcc-7 > [3/3] Detect partially dead aggregate stores, rewriting the partially > dead store to only store the live bytes. Also for gcc-7. > > > Obviously [1/3] would need compile-time benchmarking, but I really don't > expect any issues there. So what's the overall statistic result on [1/3] if you exclude the clobbers? Richard. > Jeff
On 02/22/2016 07:32 AM, Richard Biener wrote: >> Presumably DOM is not looking at r = s.r and realizing it could look s.r >> piece-wise in the available expression table. If it did, it would >> effectively turn that fragment into: >> >> s = { {1, 2}, 3 }; >> s.r.x = 1; >> s.r.y = 2; >> struct R r = {1, 2} >> s.z = 3; >> >> At which point we no longer have the may-read of s.r.{x,y} and DSE would see >> the initial assignment as dead. > > Yeah, but if it does not become dead you just increased code size or lifetime... Increasing lifetimes is inherent in just about any CSE optimization. But as I mentioned, I'm not sure trying to add this aggregate handling to DOM is wise. > > FRE does something related - it looks at all piecewise uses of 'r' and > eventually replaces them with pieces of s.r when seeing the r = s.r > aggregate assignment. Of course that only makes the store to r dead if there > are no uses of it left. *If* we were to try and do something similar in DOM, we'd probably want to try and share much of the infrastructure. I'll keep the FRE code in mind. > >> I also looked a bit at cases where we find that while an entire store (such >> as an aggregate initialization or mem*) may not be dead, pieces of the store >> may be dead. That's trivial to detect. It triggers relatively often. >> The trick is once detected, we have to go back and rewrite the original >> statement to only store the live parts. I've only written the detection >> code, the rewriting might be somewhat painful. > > Yes. I think SRA has all the code to do that though, see how it > does scalarization of constant pool loads like Ohhh. Good idea, I'll dig around SRA for a bit and see if there's something that can be re-used. >> I'm starting to wonder if what we have is a 3-part series. 
>> >> [1/3] The basic tracking to handle 33562, possibly included in gcc-6 >> [2/3] Ignore reads that reference stuff not in live_bytes for gcc-7 >> [3/3] Detect partially dead aggregate stores, rewriting the partially >> dead store to only store the live bytes. Also for gcc-7. >> >> >> Obviously [1/3] would need compile-time benchmarking, but I really don't >> expect any issues there. > > So what's the overall statistic result on [1/3] if you exclude the clobbers? Very few, call it a dozen, all in libstdc++. They weren't significantly different than ssa-dse-9.c, so I didn't try to build nice reduced testcases for them given we've got existing coverage. One could argue that with the few real world cases that 33562 could be punted to P4 and the patch series deferred to gcc-7. I wouldn't lose sleep over that option. Jeff
On Mon, Feb 22, 2016 at 5:32 PM, Jeff Law <law@redhat.com> wrote: > On 02/22/2016 07:32 AM, Richard Biener wrote: > >>> Presumably DOM is not looking at r = s.r and realizing it could look s.r >>> piece-wise in the available expression table. If it did, it would >>> effectively turn that fragment into: >>> >>> s = { {1, 2}, 3 }; >>> s.r.x = 1; >>> s.r.y = 2; >>> struct R r = {1, 2} >>> s.z = 3; >>> >>> At which point we no longer have the may-read of s.r.{x,y} and DSE would >>> see >>> the initial assignment as dead. >> >> >> Yeah, but if it does not become dead you just increased code size or >> lifetime... > > Increasing lifetimes is inherent in just about any CSE optimization. But as > I mentioned, I'm not sure trying to add this aggregate handling to DOM is > wise. > >> >> FRE does something related - it looks at all piecewise uses of 'r' and >> eventually replaces them with pieces of s.r when seeing the r = s.r >> aggregate assignment. Of course that only makes the store to r dead if >> there >> are no uses of it left. > > *If* we were to try and do something similar in DOM, we'd probably want to > try and share much of the infrastructure. I'll keep the FRE code in mind. > >> >>> I also looked a bit at cases where we find that while an entire store >>> (such >>> as an aggregate initialization or mem*) may not be dead, pieces of the >>> store >>> may be dead. That's trivial to detect. It triggers relatively often. >>> The trick is once detected, we have to go back and rewrite the original >>> statement to only store the live parts. I've only written the detection >>> code, the rewriting might be somewhat painful. >> >> >> Yes. I think SRA has all the code to do that though, see how it >> does scalarization of constant pool loads like > > Ohhh. Good idea, I'll dig around SRA for a bit and see if there's something > that can be re-used. > >>> I'm starting to wonder if what we have is a 3-part series. 
>>> >>> [1/3] The basic tracking to handle 33562, possibly included in gcc-6 >>> [2/3] Ignore reads that reference stuff not in live_bytes for gcc-7 >>> [3/3] Detect partially dead aggregate stores, rewriting the partially >>> dead store to only store the live bytes. Also for gcc-7. >>> >>> >>> Obviously [1/3] would need compile-time benchmarking, but I really don't >>> expect any issues there. >> >> >> So what's the overall statistic result on [1/3] if you exclude the >> clobbers? > > Very few, call it a dozen, all in libstdc++. They weren't significantly > different than ssa-dse-9.c, so I didn't try to build nice reduced testcases > for them given we've got existing coverage. > > One could argue that with the few real world cases that 33562 could be > punted to P4 and and patch series deferred to gcc-7. I wouldn't lose sleep > over that option. Way to go at this point IMHO. That is, keep at P2 please, P4 is for sth else. It's not a P1 blocker anyway. Richard. > Jeff >
On 02/18/2016 02:56 AM, Richard Biener wrote: > On Wed, Feb 17, 2016 at 5:10 PM, Jeff Law <law@redhat.com> wrote: >> On 02/17/2016 07:13 AM, Richard Biener wrote: >>>> >>>> - /* Continue walking until we reach a kill. */ >>>> - while (!stmt_kills_ref_p (temp, ref)); >>>> + /* Continue walking until we reach a full kill as a single statement >>>> + or there are no more live bytes. */ >>>> + while (!stmt_kills_ref_p (temp, ref) >>>> + && !(live_bytes && bitmap_empty_p (live_bytes))); >>> >>> >>> Just a short quick comment - the above means you only handle partial >>> stores >>> with no interveaning uses. You don't handle, say >>> >>> struct S { struct R { int x; int y; } r; int z; } s; >>> >>> s = { {1, 2}, 3 }; >>> s.r.x = 1; >>> s.r.y = 2; >>> struct R r = s.r; >>> s.z = 3; >>> >>> where s = { {1, 2}, 3} is still dead. >> >> Right. But handling that has never been part of DSE's design goals. Once >> there's a use, DSE has always given up. > > Yeah, which is why I in the end said we need a "better" DSE ... And coming back to this -- these kinds of opportunities appear to be rare. I found a couple in a GCC build and some in the libstdc++ testsuite. From looking at how your test is currently handled, the combination of SRA and early FRE tends to clean things up before DSE gets a chance. That may account for the lack of "hits" for this improvement. Regardless, the code is written (#4 in the recently posted series). I'm going to add your test to the updated patch (with SRA and FRE disabled obviously) as an additional test given there's very little coverage of this feature outside the libstdc++ testsuite. > > Yeah, I think the case we're after and that happens most is sth like > > a = { aggregate init }; > a.a = ...; > a.b = ...; > ... > > and what you add is the ability to remove the aggregate init completely. 
> > What would be nice to have is to remove it partly as well, as for > > struct { int i; int j; int k; } a = {}; > a.i = 1; > a.k = 3; > > we'd like to remove the whole-a zeroing but we need to keep zeroing > of a.j. > > I believe that SRA already has most of the analysis part, what it is > lacking is that SRA works not flow-sensitive (it just gathers > function-wide data) and that it doesn't consider base objects that > have their address taken or are pointer-based. So that's handled by patch #2 and it's (by far) the most effective part of this work in terms of hits and reducing the number of stored bytes. Patch #2 has a few tests for this case and it is well exercised by a bootstrap as well, I don't think your testcase provides any additional coverage. Jeff
diff --git a/gcc/params.def b/gcc/params.def index c0494fa..5aa146b 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -520,6 +520,11 @@ DEFPARAM(PARAM_IV_ALWAYS_PRUNE_CAND_SET_BOUND, "If number of candidates in the set is smaller, we always try to remove unused ivs during its optimization.", 10, 0, 0) +DEFPARAM(PARAM_DSE_MAX_OBJECT_SIZE, + "dse-max-object-size", + "Maximum size (in bytes) of objects tracked by dead store elimination.", + 64, 0, 0) + DEFPARAM(PARAM_SCEV_MAX_EXPR_SIZE, "scev-max-expr-size", "Bound on size of expressions used in the scalar evolutions analyzer.", diff --git a/gcc/testsuite/gcc.dg/tree-ssa/complex-4.c b/gcc/testsuite/gcc.dg/tree-ssa/complex-4.c index 87a2638..3155741 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/complex-4.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/complex-4.c @@ -10,4 +10,4 @@ int f(void) return g(&t); } -/* { dg-final { scan-tree-dump-times "__complex__" 0 "optimized" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "__complex__" 0 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/complex-5.c b/gcc/testsuite/gcc.dg/tree-ssa/complex-5.c index e2cd403..e6d027f 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/complex-5.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/complex-5.c @@ -8,4 +8,4 @@ int f(void) __imag__ t = 2; } -/* { dg-final { scan-tree-dump-times "__complex__" 0 "optimized" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "__complex__" 0 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-9.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-9.c index 594c20c..ae48ddd 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-9.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-9.c @@ -11,4 +11,4 @@ foo () } /* We should eliminate the first assignment. 
*/ -/* { dg-final { scan-tree-dump-times "VDEF" 2 "dse1" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "VDEF" 2 "dse1" } } */ diff --git a/gcc/tree-ssa-dse.c b/gcc/tree-ssa-dse.c index 372a0be..97a091b 100644 --- a/gcc/tree-ssa-dse.c +++ b/gcc/tree-ssa-dse.c @@ -33,6 +33,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-dfa.h" #include "domwalk.h" #include "tree-cfgcleanup.h" +#include "params.h" /* This file implements dead store elimination. @@ -68,6 +69,58 @@ along with GCC; see the file COPYING3. If not see remove their dead edges eventually. */ static bitmap need_eh_cleanup; +/* Clear any bytes written by STMT from the bitmap LIVE_BYTES. The base + address written by STMT must match the one found in REF, which must + have its base address previously initialized. + + This routine must be conservative. If we don't know the offset or + actual size written, assume nothing was written. */ + +static void +clear_bytes_written_by (bitmap live_bytes, gimple *stmt, ao_ref *ref) +{ + ao_ref write; + write.base = NULL; + + /* It's advantageous to handle certain mem* functions. */ + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)) + { + switch (DECL_FUNCTION_CODE (gimple_call_fndecl (stmt))) + { + case BUILT_IN_MEMCPY: + case BUILT_IN_MEMMOVE: + case BUILT_IN_MEMSET: + { + tree size = NULL_TREE; + if (gimple_call_num_args (stmt) == 3) + size = gimple_call_arg (stmt, 2); + tree ptr = gimple_call_arg (stmt, 0); + ao_ref_init_from_ptr_and_size (&write, ptr, size); + ao_ref_base (&write); + } + default: + break; + } + } + else if (is_gimple_assign (stmt)) + { + ao_ref_init (&write, gimple_assign_lhs (stmt)); + ao_ref_base (&write); + } + + /* Verify we have the same base memory address and that the write + has a known size. If so, then clear the appropriate bytes in + the LIVE_BYTES bitmap. 
*/ + if (write.base + && write.base == ref->base + && write.size == write.max_size + && (write.size % BITS_PER_UNIT) == 0 + && (write.offset % BITS_PER_UNIT) == 0 + && write.max_size != -1) + bitmap_clear_range (live_bytes, + write.offset / BITS_PER_UNIT, + write.size / BITS_PER_UNIT); +} /* A helper of dse_optimize_stmt. Given a GIMPLE_ASSIGN in STMT that writes to REF, find a candidate @@ -79,9 +132,33 @@ dse_possible_dead_store_p (ao_ref *ref, gimple *stmt, gimple **use_stmt) { gimple *temp; unsigned cnt = 0; + bitmap live_bytes = NULL; *use_stmt = NULL; + /* REF is a memory write. Go ahead and get its base, size, extent + information and encode the bytes written into LIVE_BYTES. We can + handle any case where we have a known base and maximum size. + + However, experimentation has shown that bit level tracking is not + useful in practice, so we only track at the byte level. + + Furthermore, experimentation has shown that 99% of the cases + that require byte tracking are 64 bytes or less. Tracking 64 + bytes also happens to fit nicely into a bitmap element. */ + if (ao_ref_base (ref) + && ref->max_size + && (ref->offset % BITS_PER_UNIT) == 0 + && (ref->max_size % BITS_PER_UNIT) == 0 + && ref->max_size <= PARAM_VALUE (PARAM_DSE_MAX_OBJECT_SIZE) + && ref->max_size != -1) + { + live_bytes = BITMAP_ALLOC (NULL); + bitmap_set_range (live_bytes, + ref->offset / BITS_PER_UNIT, + ref->max_size / BITS_PER_UNIT); + } + /* Find the first dominated statement that clobbers (part of) the memory stmt stores to with no intermediate statement that may use part of the memory stmt stores. That is, find a store that may @@ -177,11 +254,18 @@ dse_possible_dead_store_p (ao_ref *ref, gimple *stmt, gimple **use_stmt) temp = stmt; break; } + + if (live_bytes && temp) + clear_bytes_written_by (live_bytes, temp, ref); } - /* Continue walking until we reach a kill. 
*/ - while (!stmt_kills_ref_p (temp, ref)); + /* Continue walking until we reach a full kill as a single statement + or there are no more live bytes. */ + while (!stmt_kills_ref_p (temp, ref) + && !(live_bytes && bitmap_empty_p (live_bytes))); *use_stmt = temp; + if (live_bytes) + BITMAP_FREE (live_bytes); return true; }