diff mbox

Optimiza aggregate a = b = c = {} (PR c/78408, take 2)

Message ID 20161215164126.GJ21933@tucnak
State New
Headers show

Commit Message

Jakub Jelinek Dec. 15, 2016, 4:41 p.m. UTC
On Wed, Dec 14, 2016 at 01:27:56PM +0100, Richard Biener wrote:
> Ah, ok.  But then the size of the memset shouldn't be compared
> against the get_ref_base_and_extend size from src2 but to the
> size of the access of SRC/DEST (clearly looking at the "size" of
> the ADDR_EXPR argument is somewhat bogus).
> And as you compare src and dest
> with operand_equal_p there is no need to reject ssize != max_size
> either (you'd of course not see memset (&a[i].x, 0, 400) because
> &a[i].x is not invariant, you'd need to lookup the SSA def for a pointer
> here).
> 
> You can get at the size of an actual access simpler than by
> the full-blown get_ref_base_and_extent (just outline a
> get_ref_size () from the head part of it.
> 
> I still think that using get_addr_base_and_unit_offset on the
> address is better and passing decomposed (base, offset, size)
> triplets to optimize_memcpy would also save you the
> MEM[(char * {ref-all})&b] = MEM[(char * {ref-all})&a]; special-case.

Here is an updated patch that does that (i.e. always work with base, offset,
length triplets) and drops the alias check for dest vs. src overlap.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2016-12-15  Jakub Jelinek  <jakub@redhat.com>

	PR c/78408
	* tree-ssa-ccp.c: Include tree-dfa.h.
	(optimize_memcpy): New function.
	(pass_fold_builtins::execute): Use it.  Remove useless conditional
	break after BUILT_IN_VA_*.

	* gcc.dg/pr78408-1.c: New test.
	* gcc.dg/pr78408-2.c: New test.



	Jakub

Comments

Richard Biener Dec. 16, 2016, 12:50 p.m. UTC | #1
On Thu, 15 Dec 2016, Jakub Jelinek wrote:

> On Wed, Dec 14, 2016 at 01:27:56PM +0100, Richard Biener wrote:
> > Ah, ok.  But then the size of the memset shouldn't be compared
> > against the get_ref_base_and_extend size from src2 but to the
> > size of the access of SRC/DEST (clearly looking at the "size" of
> > the ADDR_EXPR argument is somewhat bogus).
> > And as you compare src and dest
> > with operand_equal_p there is no need to reject ssize != max_size
> > either (you'd of course not see memset (&a[i].x, 0, 400) because
> > &a[i].x is not invariant, you'd need to lookup the SSA def for a pointer
> > here).
> > 
> > You can get at the size of an actual access simpler than by
> > the full-blown get_ref_base_and_extent (just outline a
> > get_ref_size () from the head part of it.
> > 
> > I still think that using get_addr_base_and_unit_offset on the
> > address is better and passing decomposed (base, offset, size)
> > triplets to optimize_memcpy would also save you the
> > MEM[(char * {ref-all})&b] = MEM[(char * {ref-all})&a]; special-case.
> 
> Here is an updated patch that does that (i.e. always work with base, offset,
> length triplets) and drops the alias check for dest vs. src overlap.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2016-12-15  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR c/78408
> 	* tree-ssa-ccp.c: Include tree-dfa.h.
> 	(optimize_memcpy): New function.
> 	(pass_fold_builtins::execute): Use it.  Remove useless conditional
> 	break after BUILT_IN_VA_*.
> 
> 	* gcc.dg/pr78408-1.c: New test.
> 	* gcc.dg/pr78408-2.c: New test.
> 
> --- gcc/tree-ssa-ccp.c.jj	2016-11-28 16:19:11.000000000 +0100
> +++ gcc/tree-ssa-ccp.c	2016-12-15 15:09:03.180993745 +0100
> @@ -143,6 +143,7 @@ along with GCC; see the file COPYING3.
>  #include "stor-layout.h"
>  #include "optabs-query.h"
>  #include "tree-ssa-ccp.h"
> +#include "tree-dfa.h"
>  
>  /* Possible lattice values.  */
>  typedef enum
> @@ -2933,6 +2934,113 @@ optimize_atomic_bit_test_and (gimple_stm
>    release_ssa_name (lhs);
>  }
>  
> +/* Optimize
> +   a = {};
> +   b = a;
> +   into
> +   a = {};
> +   b = {};
> +   Similarly for memset (&a, ..., sizeof (a)); instead of a = {};
> +   and/or memcpy (&b, &a, sizeof (a)); instead of b = a;  */
> +
> +static void
> +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len)
> +{
> +  gimple *stmt = gsi_stmt (*gsip);
> +  if (gimple_has_volatile_ops (stmt)
> +      || TREE_THIS_VOLATILE (dest)
> +      || TREE_THIS_VOLATILE (src))

The last two should be redundant and/or not necessary.

> +    return;
> +
> +  tree vuse = gimple_vuse (stmt);
> +  if (vuse == NULL)
> +    return;
> +
> +  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
> +  tree src2 = NULL_TREE, len2 = NULL_TREE;
> +  HOST_WIDE_INT offset, offset2;
> +  tree val = integer_zero_node;
> +  if (gimple_store_p (defstmt)
> +      && gimple_assign_single_p (defstmt)
> +      && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR
> +      && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0

Should be always true for stores from constructors.

> +      && !gimple_clobber_p (defstmt))
> +    src2 = gimple_assign_lhs (defstmt);
> +  else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET)
> +	   && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR
> +	   && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST)
> +    {
> +      src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0);
> +      len2 = gimple_call_arg (defstmt, 2);
> +      val = gimple_call_arg (defstmt, 1);
> +      if (!integer_zerop (val) && is_gimple_assign (stmt))

Please add a comment here.  I think below ... (*)

> +	src2 = NULL_TREE;
> +    }
> +
> +  if (src2 == NULL_TREE)
> +    return;
> +
> +  if (len == NULL_TREE)
> +    len = (TREE_CODE (src) == COMPONENT_REF
> +	   ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1))
> +	   : TYPE_SIZE_UNIT (TREE_TYPE (src)));
> +  if (len2 == NULL_TREE)
> +    len2 = (TREE_CODE (src2) == COMPONENT_REF
> +	    ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1))
> +	    : TYPE_SIZE_UNIT (TREE_TYPE (src2)));
> +  if (len == NULL_TREE
> +      || TREE_CODE (len) != INTEGER_CST
> +      || len2 == NULL_TREE
> +      || TREE_CODE (len2) != INTEGER_CST)
> +    return;
> +
> +  src = get_addr_base_and_unit_offset (src, &offset);
> +  src2 = get_addr_base_and_unit_offset (src2, &offset2);
> +  if (src == NULL_TREE
> +      || src2 == NULL_TREE
> +      || offset < offset2)
> +    return;
> +
> +  if (!operand_equal_p (src, src2, 0))
> +    return;
> +
> +  /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val.
> +     Make sure that
> +     [ src + offset, src + offset + len - 1 ] is a subset of that.  */
> +  if (wi::to_widest (len) + (offset - offset2) > wi::to_widest (len2))

I think you can use ::to_offset which is more efficient.

> +    return;
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "Simplified\n  ");
> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +      fprintf (dump_file, "after previous\n  ");
> +      print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
> +    }
> +
> +  if (is_gimple_assign (stmt))

(*) a better check would be if val is zero because even a memcpy
could be optimized to an assignment from {}.  I suppose you're
doing it this way because of simplicity and because we're
late in the compilation and thus it doesn't matter in practice
for optimization?  (the 2nd destination could become
non-TREE_ADDRESSABLE, enabling better alias disambiguation)

Ok with the above changes, some more comments are ok to resolve
the last one.

> +    {
> +      tree ctor = build_constructor (TREE_TYPE (dest), NULL);
> +      gimple_assign_set_rhs_from_tree (gsip, ctor);
> +      update_stmt (stmt);
> +    }
> +  else

 /* stmt is memcpy */

Thanks,
Richard.

> +    {
> +      gcall *call = as_a <gcall *> (stmt);
> +      tree fndecl = builtin_decl_implicit (BUILT_IN_MEMSET);
> +      gimple_call_set_fndecl (call, fndecl);
> +      gimple_call_set_fntype (call, TREE_TYPE (fndecl));
> +      gimple_call_set_arg (call, 1, val);
> +      update_stmt (stmt);
> +    }
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "into\n  ");
> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +    }
> +}
> +
>  /* A simple pass that attempts to fold all builtin functions.  This pass
>     is run after we've propagated as many constants as we can.  */
>  
> @@ -2999,6 +3107,9 @@ pass_fold_builtins::execute (function *f
>  		      continue;
>  		    }
>  		}
> +	      else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt))
> +		optimize_memcpy (&i, gimple_assign_lhs (stmt),
> +				 gimple_assign_rhs1 (stmt), NULL_TREE);
>  	      gsi_next (&i);
>  	      continue;
>  	    }
> @@ -3114,14 +3225,25 @@ pass_fold_builtins::execute (function *f
>  						false, false);
>  		  break;
>  
> +		case BUILT_IN_MEMCPY:
> +		  if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)
> +		      && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR
> +		      && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR
> +		      && TREE_CODE (gimple_call_arg (stmt, 2)) == INTEGER_CST)
> +		    {
> +		      tree dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
> +		      tree src = TREE_OPERAND (gimple_call_arg (stmt, 1), 0);
> +		      tree len = gimple_call_arg (stmt, 2);
> +		      optimize_memcpy (&i, dest, src, len);
> +		    }
> +		  break;
> +
>  		case BUILT_IN_VA_START:
>  		case BUILT_IN_VA_END:
>  		case BUILT_IN_VA_COPY:
>  		  /* These shouldn't be folded before pass_stdarg.  */
>  		  result = optimize_stdarg_builtin (stmt);
> -		  if (result)
> -		    break;
> -		  /* FALLTHRU */
> +		  break;
>  
>  		default:;
>  		}
> --- gcc/testsuite/gcc.dg/pr78408-1.c.jj	2016-12-15 13:54:08.942121305 +0100
> +++ gcc/testsuite/gcc.dg/pr78408-1.c	2016-12-15 15:13:04.900774924 +0100
> @@ -0,0 +1,88 @@
> +/* PR c/78408 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-fab1-details" } */
> +/* { dg-final { scan-tree-dump-times "after previous" 17 "fab1" } } */
> +
> +struct S { char a[32]; };
> +struct T { char a[65536]; };
> +void bar (int, struct S *, struct S *, struct T *, struct T *);
> +void baz (char *, char *);
> +
> +void
> +f1 (void)
> +{
> +  struct S a, b;
> +  struct T c, d;
> +  a = b = (struct S) {};
> +  c = d = (struct T) {};
> +  bar (1, &a, &b, &c, &d);
> +}
> +
> +void
> +f2 (void)
> +{
> +  struct S a, b;
> +  struct T c, d;
> +  b = (struct S) {};
> +  a = b;
> +  d = (struct T) {};
> +  c = d;
> +  bar (2, &a, &b, &c, &d);
> +}
> +
> +void
> +f3 (void)
> +{
> +  struct S a, b;
> +  struct T c, d;
> +  __builtin_memset (&b, 0, sizeof (b));
> +  a = b;
> +  __builtin_memset (&d, 0, sizeof (d));
> +  c = d;
> +  bar (3, &a, &b, &c, &d);
> +}
> +
> +
> +void
> +f4 (void)
> +{
> +  struct S a, b;
> +  struct T c, d;
> +  b = (struct S) {};
> +  __builtin_memcpy (&a, &b, sizeof (b));
> +  d = (struct T) {};
> +  __builtin_memcpy (&c, &d, sizeof (d));
> +  bar (4, &a, &b, &c, &d);
> +}
> +
> +void
> +f5 (void)
> +{
> +  struct S a, b;
> +  struct T c, d;
> +  __builtin_memset (&b, 0, sizeof (b));
> +  __builtin_memcpy (&a, &b, sizeof (b));
> +  __builtin_memset (&d, 0, sizeof (d));
> +  __builtin_memcpy (&c, &d, sizeof (d));
> +  bar (5, &a, &b, &c, &d);
> +}
> +
> +void
> +f6 (void)
> +{
> +  struct S a, b, e, g;
> +  struct T c, d, f, h;
> +  g = e = a = b = (struct S) {};
> +  h = f = c = d = (struct T) {};
> +  bar (6, &a, &b, &c, &d);
> +  bar (6, &e, &g, &f, &h);
> +}
> +
> +void
> +f7 (void)
> +{
> +  char a[64], b[64];
> +  __builtin_memset (a + 13, 2, 27);
> +  __builtin_memcpy (b + 4, a + 17, 23);
> +  baz (a, b);
> +}
> --- gcc/testsuite/gcc.dg/pr78408-2.c.jj	2016-12-15 15:13:48.060200199 +0100
> +++ gcc/testsuite/gcc.dg/pr78408-2.c	2016-12-15 15:15:50.880564683 +0100
> @@ -0,0 +1,39 @@
> +/* PR c/78408 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-fab1-details" } */
> +/* { dg-final { scan-tree-dump-not "after previous" "fab1" } } */
> +
> +struct S { char a[32]; };
> +struct T { char a[65536]; };
> +void bar (int, struct S *, struct S *, struct T *, struct T *);
> +void baz (char *, char *);
> +
> +void
> +f1 (void)
> +{
> +  struct S a, b;
> +  struct T c, d;
> +  __builtin_memset (&b, 2, sizeof (b));
> +  a = b;
> +  __builtin_memset (&d, 3, sizeof (d));
> +  c = d;
> +  bar (3, &a, &b, &c, &d);
> +}
> +
> +void
> +f2 (void)
> +{
> +  char a[64], b[64];
> +  __builtin_memset (a + 13, 2, 27);
> +  __builtin_memcpy (b + 4, a + 17, 24);
> +  baz (a, b);
> +}
> +
> +void
> +f3 (void)
> +{
> +  char a[64], b[64];
> +  __builtin_memset (a + 13, 2, 27);
> +  __builtin_memcpy (b + 4, a + 12, 5);
> +  baz (a, b);
> +}
> 
> 
> 	Jakub
> 
>
Jakub Jelinek Dec. 16, 2016, 1:06 p.m. UTC | #2
On Fri, Dec 16, 2016 at 01:50:54PM +0100, Richard Biener wrote:
> > +static void
> > +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len)
> > +{
> > +  gimple *stmt = gsi_stmt (*gsip);
> > +  if (gimple_has_volatile_ops (stmt)
> > +      || TREE_THIS_VOLATILE (dest)
> > +      || TREE_THIS_VOLATILE (src))
> 
> The last two should be redundant and/or not necessary.

I've been doing this if stmt is __builtin_memcpy and the actual
objects are TREE_THIS_VOLATILE, then I wanted to punt.
But I guess if we never transform __builtin_memcpy into anything
other than __builtin_memset, then it isn't a problem.

> > +    return;
> > +
> > +  tree vuse = gimple_vuse (stmt);
> > +  if (vuse == NULL)
> > +    return;
> > +
> > +  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
> > +  tree src2 = NULL_TREE, len2 = NULL_TREE;
> > +  HOST_WIDE_INT offset, offset2;
> > +  tree val = integer_zero_node;
> > +  if (gimple_store_p (defstmt)
> > +      && gimple_assign_single_p (defstmt)
> > +      && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR
> > +      && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0
> 
> Should be always true for stores from constructors.

Ok, can I just gcc_checking_assert verify it or is that not worth it?

> > +      && !gimple_clobber_p (defstmt))
> > +    src2 = gimple_assign_lhs (defstmt);
> > +  else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET)
> > +	   && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR
> > +	   && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST)
> > +    {
> > +      src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0);
> > +      len2 = gimple_call_arg (defstmt, 2);
> > +      val = gimple_call_arg (defstmt, 1);
> > +      if (!integer_zerop (val) && is_gimple_assign (stmt))
> 
> Please add a comment here.  I think below ... (*)
> 
> > +	src2 = NULL_TREE;
> > +    }
> > +
> > +  if (src2 == NULL_TREE)
> > +    return;
> > +
> > +  if (len == NULL_TREE)
> > +    len = (TREE_CODE (src) == COMPONENT_REF
> > +	   ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1))
> > +	   : TYPE_SIZE_UNIT (TREE_TYPE (src)));
> > +  if (len2 == NULL_TREE)
> > +    len2 = (TREE_CODE (src2) == COMPONENT_REF
> > +	    ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1))
> > +	    : TYPE_SIZE_UNIT (TREE_TYPE (src2)));
> > +  if (len == NULL_TREE
> > +      || TREE_CODE (len) != INTEGER_CST
> > +      || len2 == NULL_TREE
> > +      || TREE_CODE (len2) != INTEGER_CST)
> > +    return;
> > +
> > +  src = get_addr_base_and_unit_offset (src, &offset);
> > +  src2 = get_addr_base_and_unit_offset (src2, &offset2);
> > +  if (src == NULL_TREE
> > +      || src2 == NULL_TREE
> > +      || offset < offset2)
> > +    return;
> > +
> > +  if (!operand_equal_p (src, src2, 0))
> > +    return;
> > +
> > +  /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val.
> > +     Make sure that
> > +     [ src + offset, src + offset + len - 1 ] is a subset of that.  */
> > +  if (wi::to_widest (len) + (offset - offset2) > wi::to_widest (len2))
> 
> I think you can use ::to_offset which is more efficient.

That is reasonable.

> > +    return;
> > +
> > +  if (dump_file && (dump_flags & TDF_DETAILS))
> > +    {
> > +      fprintf (dump_file, "Simplified\n  ");
> > +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> > +      fprintf (dump_file, "after previous\n  ");
> > +      print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
> > +    }
> > +
> > +  if (is_gimple_assign (stmt))
> 
> (*) a better check would be if val is zero because even a memcpy
> could be optimized to an assignment from {}.  I suppose you're
> doing it this way because of simplicity and because we're
> late in the compilation and thus it doesn't matter in practice
> for optimization?  (the 2nd destination could become
> non-TREE_ADDRESSABLE, enabling better alias disambiguation)

Is memset and = {} equivalent even for aggregates with paddings?
If val is not zero, then I can only handle it if stmt is memcpy,
(or in theory if stmt is assignment, and dest is addressable it
could be transformed into memset).  If val is zero, and dest isn't
volatile and the size matches the size of dest, then perhaps it
could be transformed into = {} (if there are no paddings/bitfields
or if it doesn't matter), I haven't done that for simplicity indeed.

	Jakub
Richard Biener Dec. 16, 2016, 1:16 p.m. UTC | #3
On Fri, 16 Dec 2016, Jakub Jelinek wrote:

> On Fri, Dec 16, 2016 at 01:50:54PM +0100, Richard Biener wrote:
> > > +static void
> > > +optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len)
> > > +{
> > > +  gimple *stmt = gsi_stmt (*gsip);
> > > +  if (gimple_has_volatile_ops (stmt)
> > > +      || TREE_THIS_VOLATILE (dest)
> > > +      || TREE_THIS_VOLATILE (src))
> > 
> > The last two should be redundant and/or not necessary.
> 
> I've been doing this if stmt is __builtin_memcpy and the actual
> objects are TREE_THIS_VOLATILE, then I wanted to punt.
> But I guess if we never transform __builtin_memcpy into anything
> other than __builtin_memset, then it isn't a problem.

Yeah, and a memcpy of a volatile object isn't a volatile access anyway.

> > > +    return;
> > > +
> > > +  tree vuse = gimple_vuse (stmt);
> > > +  if (vuse == NULL)
> > > +    return;
> > > +
> > > +  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
> > > +  tree src2 = NULL_TREE, len2 = NULL_TREE;
> > > +  HOST_WIDE_INT offset, offset2;
> > > +  tree val = integer_zero_node;
> > > +  if (gimple_store_p (defstmt)
> > > +      && gimple_assign_single_p (defstmt)
> > > +      && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR
> > > +      && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0
> > 
> > Should be always true for stores from constructors.
> 
> Ok, can I just gcc_checking_assert verify it or is that not worth it?

Not worth it -- tree-cfg.c gimple verification contains that already
(ok, not exactly - it allows vector CONSTRUCTORs as RHS of
!DECL_GIMPLE_REG_P vectors -- see a bit above (MEM_REF:) for a
missed check.

> 
> > > +      && !gimple_clobber_p (defstmt))
> > > +    src2 = gimple_assign_lhs (defstmt);
> > > +  else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET)
> > > +	   && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR
> > > +	   && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST)
> > > +    {
> > > +      src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0);
> > > +      len2 = gimple_call_arg (defstmt, 2);
> > > +      val = gimple_call_arg (defstmt, 1);
> > > +      if (!integer_zerop (val) && is_gimple_assign (stmt))
> > 
> > Please add a comment here.  I think below ... (*)
> > 
> > > +	src2 = NULL_TREE;
> > > +    }
> > > +
> > > +  if (src2 == NULL_TREE)
> > > +    return;
> > > +
> > > +  if (len == NULL_TREE)
> > > +    len = (TREE_CODE (src) == COMPONENT_REF
> > > +	   ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1))
> > > +	   : TYPE_SIZE_UNIT (TREE_TYPE (src)));
> > > +  if (len2 == NULL_TREE)
> > > +    len2 = (TREE_CODE (src2) == COMPONENT_REF
> > > +	    ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1))
> > > +	    : TYPE_SIZE_UNIT (TREE_TYPE (src2)));
> > > +  if (len == NULL_TREE
> > > +      || TREE_CODE (len) != INTEGER_CST
> > > +      || len2 == NULL_TREE
> > > +      || TREE_CODE (len2) != INTEGER_CST)
> > > +    return;
> > > +
> > > +  src = get_addr_base_and_unit_offset (src, &offset);
> > > +  src2 = get_addr_base_and_unit_offset (src2, &offset2);
> > > +  if (src == NULL_TREE
> > > +      || src2 == NULL_TREE
> > > +      || offset < offset2)
> > > +    return;
> > > +
> > > +  if (!operand_equal_p (src, src2, 0))
> > > +    return;
> > > +
> > > +  /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val.
> > > +     Make sure that
> > > +     [ src + offset, src + offset + len - 1 ] is a subset of that.  */
> > > +  if (wi::to_widest (len) + (offset - offset2) > wi::to_widest (len2))
> > 
> > I think you can use ::to_offset which is more efficient.
> 
> That is reasonable.
> 
> > > +    return;
> > > +
> > > +  if (dump_file && (dump_flags & TDF_DETAILS))
> > > +    {
> > > +      fprintf (dump_file, "Simplified\n  ");
> > > +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> > > +      fprintf (dump_file, "after previous\n  ");
> > > +      print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
> > > +    }
> > > +
> > > +  if (is_gimple_assign (stmt))
> > 
> > (*) a better check would be if val is zero because even a memcpy
> > could be optimized to an assignment from {}.  I suppose you're
> > doing it this way because of simplicity and because we're
> > late in the compilation and thus it doesn't matter in practice
> > for optimization?  (the 2nd destination could become
> > non-TREE_ADDRESSABLE, enabling better alias disambiguation)
> 
> Is memset and = {} equivalent even for aggregates with paddings?

When lowering memset to = {} you should use

 MEM<char[]> (&decl, off-of-original-alias-type) = {}-of-type-char[];

similar for lowering of memcpy (I think what gimple-fold.c does
for example is slightly unsafe).  But in practice I think nothing
in GCC assumes you can lower accesses and ignore padding if that
pass (like SRA) does not see all accesses to prove that is safe.

> If val is not zero, then I can only handle it if stmt is memcpy,
> (or in theory if stmt is assignment, and dest is addressable it
> could be transformed into memset).  If val is zero, and dest isn't
> volatile and the size matches the size of dest, then perhaps it
> could be transformed into = {} (if there are no paddings/bitfields
> or if it doesn't matter), I haven't done that for simplicity indeed.

Yes, please add a comment so a curious bystander doesn't have to figure
that out himself.

Richard.
Jeff Law Dec. 19, 2016, 8:17 p.m. UTC | #4
On 12/16/2016 05:50 AM, Richard Biener wrote:

>> +  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
>> +  tree src2 = NULL_TREE, len2 = NULL_TREE;
>> +  HOST_WIDE_INT offset, offset2;
>> +  tree val = integer_zero_node;
>> +  if (gimple_store_p (defstmt)
>> +      && gimple_assign_single_p (defstmt)
>> +      && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR
>> +      && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0
>
> Should be always true for stores from constructors.
Seems like I should drop similar checks from the in-progress DSE 
changes.  I thought I saw something in SRA to cover when this wasn't 
true as well that could probably be simplified.

jeff
diff mbox

Patch

--- gcc/tree-ssa-ccp.c.jj	2016-11-28 16:19:11.000000000 +0100
+++ gcc/tree-ssa-ccp.c	2016-12-15 15:09:03.180993745 +0100
@@ -143,6 +143,7 @@  along with GCC; see the file COPYING3.
 #include "stor-layout.h"
 #include "optabs-query.h"
 #include "tree-ssa-ccp.h"
+#include "tree-dfa.h"
 
 /* Possible lattice values.  */
 typedef enum
@@ -2933,6 +2934,113 @@  optimize_atomic_bit_test_and (gimple_stm
   release_ssa_name (lhs);
 }
 
+/* Optimize
+   a = {};
+   b = a;
+   into
+   a = {};
+   b = {};
+   Similarly for memset (&a, ..., sizeof (a)); instead of a = {};
+   and/or memcpy (&b, &a, sizeof (a)); instead of b = a;  */
+
+static void
+optimize_memcpy (gimple_stmt_iterator *gsip, tree dest, tree src, tree len)
+{
+  gimple *stmt = gsi_stmt (*gsip);
+  if (gimple_has_volatile_ops (stmt)
+      || TREE_THIS_VOLATILE (dest)
+      || TREE_THIS_VOLATILE (src))
+    return;
+
+  tree vuse = gimple_vuse (stmt);
+  if (vuse == NULL)
+    return;
+
+  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
+  tree src2 = NULL_TREE, len2 = NULL_TREE;
+  HOST_WIDE_INT offset, offset2;
+  tree val = integer_zero_node;
+  if (gimple_store_p (defstmt)
+      && gimple_assign_single_p (defstmt)
+      && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR
+      && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (defstmt)) == 0
+      && !gimple_clobber_p (defstmt))
+    src2 = gimple_assign_lhs (defstmt);
+  else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET)
+	   && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR
+	   && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST)
+    {
+      src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0);
+      len2 = gimple_call_arg (defstmt, 2);
+      val = gimple_call_arg (defstmt, 1);
+      if (!integer_zerop (val) && is_gimple_assign (stmt))
+	src2 = NULL_TREE;
+    }
+
+  if (src2 == NULL_TREE)
+    return;
+
+  if (len == NULL_TREE)
+    len = (TREE_CODE (src) == COMPONENT_REF
+	   ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1))
+	   : TYPE_SIZE_UNIT (TREE_TYPE (src)));
+  if (len2 == NULL_TREE)
+    len2 = (TREE_CODE (src2) == COMPONENT_REF
+	    ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1))
+	    : TYPE_SIZE_UNIT (TREE_TYPE (src2)));
+  if (len == NULL_TREE
+      || TREE_CODE (len) != INTEGER_CST
+      || len2 == NULL_TREE
+      || TREE_CODE (len2) != INTEGER_CST)
+    return;
+
+  src = get_addr_base_and_unit_offset (src, &offset);
+  src2 = get_addr_base_and_unit_offset (src2, &offset2);
+  if (src == NULL_TREE
+      || src2 == NULL_TREE
+      || offset < offset2)
+    return;
+
+  if (!operand_equal_p (src, src2, 0))
+    return;
+
+  /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val.
+     Make sure that
+     [ src + offset, src + offset + len - 1 ] is a subset of that.  */
+  if (wi::to_widest (len) + (offset - offset2) > wi::to_widest (len2))
+    return;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Simplified\n  ");
+      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
+      fprintf (dump_file, "after previous\n  ");
+      print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
+    }
+
+  if (is_gimple_assign (stmt))
+    {
+      tree ctor = build_constructor (TREE_TYPE (dest), NULL);
+      gimple_assign_set_rhs_from_tree (gsip, ctor);
+      update_stmt (stmt);
+    }
+  else
+    {
+      gcall *call = as_a <gcall *> (stmt);
+      tree fndecl = builtin_decl_implicit (BUILT_IN_MEMSET);
+      gimple_call_set_fndecl (call, fndecl);
+      gimple_call_set_fntype (call, TREE_TYPE (fndecl));
+      gimple_call_set_arg (call, 1, val);
+      update_stmt (stmt);
+    }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "into\n  ");
+      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
+    }
+}
+
 /* A simple pass that attempts to fold all builtin functions.  This pass
    is run after we've propagated as many constants as we can.  */
 
@@ -2999,6 +3107,9 @@  pass_fold_builtins::execute (function *f
 		      continue;
 		    }
 		}
+	      else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt))
+		optimize_memcpy (&i, gimple_assign_lhs (stmt),
+				 gimple_assign_rhs1 (stmt), NULL_TREE);
 	      gsi_next (&i);
 	      continue;
 	    }
@@ -3114,14 +3225,25 @@  pass_fold_builtins::execute (function *f
 						false, false);
 		  break;
 
+		case BUILT_IN_MEMCPY:
+		  if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)
+		      && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR
+		      && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR
+		      && TREE_CODE (gimple_call_arg (stmt, 2)) == INTEGER_CST)
+		    {
+		      tree dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
+		      tree src = TREE_OPERAND (gimple_call_arg (stmt, 1), 0);
+		      tree len = gimple_call_arg (stmt, 2);
+		      optimize_memcpy (&i, dest, src, len);
+		    }
+		  break;
+
 		case BUILT_IN_VA_START:
 		case BUILT_IN_VA_END:
 		case BUILT_IN_VA_COPY:
 		  /* These shouldn't be folded before pass_stdarg.  */
 		  result = optimize_stdarg_builtin (stmt);
-		  if (result)
-		    break;
-		  /* FALLTHRU */
+		  break;
 
 		default:;
 		}
--- gcc/testsuite/gcc.dg/pr78408-1.c.jj	2016-12-15 13:54:08.942121305 +0100
+++ gcc/testsuite/gcc.dg/pr78408-1.c	2016-12-15 15:13:04.900774924 +0100
@@ -0,0 +1,88 @@ 
+/* PR c/78408 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-fab1-details" } */
+/* { dg-final { scan-tree-dump-times "after previous" 17 "fab1" } } */
+
+struct S { char a[32]; };
+struct T { char a[65536]; };
+void bar (int, struct S *, struct S *, struct T *, struct T *);
+void baz (char *, char *);
+
+void
+f1 (void)
+{
+  struct S a, b;
+  struct T c, d;
+  a = b = (struct S) {};
+  c = d = (struct T) {};
+  bar (1, &a, &b, &c, &d);
+}
+
+void
+f2 (void)
+{
+  struct S a, b;
+  struct T c, d;
+  b = (struct S) {};
+  a = b;
+  d = (struct T) {};
+  c = d;
+  bar (2, &a, &b, &c, &d);
+}
+
+void
+f3 (void)
+{
+  struct S a, b;
+  struct T c, d;
+  __builtin_memset (&b, 0, sizeof (b));
+  a = b;
+  __builtin_memset (&d, 0, sizeof (d));
+  c = d;
+  bar (3, &a, &b, &c, &d);
+}
+
+
+void
+f4 (void)
+{
+  struct S a, b;
+  struct T c, d;
+  b = (struct S) {};
+  __builtin_memcpy (&a, &b, sizeof (b));
+  d = (struct T) {};
+  __builtin_memcpy (&c, &d, sizeof (d));
+  bar (4, &a, &b, &c, &d);
+}
+
+void
+f5 (void)
+{
+  struct S a, b;
+  struct T c, d;
+  __builtin_memset (&b, 0, sizeof (b));
+  __builtin_memcpy (&a, &b, sizeof (b));
+  __builtin_memset (&d, 0, sizeof (d));
+  __builtin_memcpy (&c, &d, sizeof (d));
+  bar (5, &a, &b, &c, &d);
+}
+
+void
+f6 (void)
+{
+  struct S a, b, e, g;
+  struct T c, d, f, h;
+  g = e = a = b = (struct S) {};
+  h = f = c = d = (struct T) {};
+  bar (6, &a, &b, &c, &d);
+  bar (6, &e, &g, &f, &h);
+}
+
+void
+f7 (void)
+{
+  char a[64], b[64];
+  __builtin_memset (a + 13, 2, 27);
+  __builtin_memcpy (b + 4, a + 17, 23);
+  baz (a, b);
+}
--- gcc/testsuite/gcc.dg/pr78408-2.c.jj	2016-12-15 15:13:48.060200199 +0100
+++ gcc/testsuite/gcc.dg/pr78408-2.c	2016-12-15 15:15:50.880564683 +0100
@@ -0,0 +1,39 @@ 
+/* PR c/78408 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-fab1-details" } */
+/* { dg-final { scan-tree-dump-not "after previous" "fab1" } } */
+
+struct S { char a[32]; };
+struct T { char a[65536]; };
+void bar (int, struct S *, struct S *, struct T *, struct T *);
+void baz (char *, char *);
+
+void
+f1 (void)
+{
+  struct S a, b;
+  struct T c, d;
+  __builtin_memset (&b, 2, sizeof (b));
+  a = b;
+  __builtin_memset (&d, 3, sizeof (d));
+  c = d;
+  bar (3, &a, &b, &c, &d);
+}
+
+void
+f2 (void)
+{
+  char a[64], b[64];
+  __builtin_memset (a + 13, 2, 27);
+  __builtin_memcpy (b + 4, a + 17, 24);
+  baz (a, b);
+}
+
+void
+f3 (void)
+{
+  char a[64], b[64];
+  __builtin_memset (a + 13, 2, 27);
+  __builtin_memcpy (b + 4, a + 12, 5);
+  baz (a, b);
+}