diff mbox

Fix PR46556 (poor address generation)

Message ID 1319463504.648.10.camel@gnopaine
State New
Headers show

Commit Message

Bill Schmidt Oct. 24, 2011, 1:38 p.m. UTC
OK, I've removed the pointer-arithmetic case from expand, to be handled
later by straight-line strength reduction.  Here's the patch to deal
with just the specific pattern of PR46556 (which will also eventually be
handled by strength reduction, but not as quickly).

(FYI, I've been thinking through the strength reduction pass, and my
plan is to stage in some of the easiest cases first, hopefully for 4.7,
and gradually add the more complex pieces.  Explicit multiplies in the
IL with known constants can be done pretty easily.  More complexity is
added when the multiplier is a variable, when conditional increments are
present, and when multiplies are hidden in addressing expressions.)

The present patch was bootstrapped and regression-tested on
powerpc64-linux.  OK for trunk?

Thanks,
Bill


2011-10-24  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

gcc:

	PR rtl-optimization/46556
	* expr.c (restructure_base_and_offset): New function.
	(expand_expr_real_1): Replace result of get_inner_reference
	with result of restructure_base_and_offset when applicable.
	* Makefile.in (expr.o): Update dependencies.

gcc/testsuite:

	PR rtl-optimization/46556
	* gcc.dg/tree-ssa-pr46556-1.c: New testcase.
	* gcc.dg/tree-ssa-pr46556-2.c: Likewise.
	* gcc.dg/tree-ssa-pr46556-3.c: Likewise.

Comments

Bill Schmidt Oct. 30, 2011, 4:20 p.m. UTC | #1
Ping.

On Mon, 2011-10-24 at 08:38 -0500, William J. Schmidt wrote:
> OK, I've removed the pointer-arithmetic case from expand, to be handled
> later by straight-line strength reduction.  Here's the patch to deal
> with just the specific pattern of PR46556 (which will also eventually be
> handled by strength reduction, but not as quickly).
> 
> (FYI, I've been thinking through the strength reduction pass, and my
> plan is to stage in some of the easiest cases first, hopefully for 4.7,
> and gradually add the more complex pieces.  Explicit multiplies in the
> IL with known constants can be done pretty easily.  More complexity is
> added when the multiplier is a variable, when conditional increments are
> present, and when multiplies are hidden in addressing expressions.)
> 
> The present patch was bootstrapped and regression-tested on
> powerpc64-linux.  OK for trunk?
> 
> Thanks,
> Bill
> 
> 
> 2011-10-24  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> gcc:
> 
> 	PR rtl-optimization/46556
> 	* expr.c (restructure_base_and_offset): New function.
> 	(expand_expr_real_1): Replace result of get_inner_reference
> 	with result of restructure_base_and_offset when applicable.
> 	* Makefile.in (expr.o): Update dependencies.
> 
> gcc/testsuite:
> 
> 	PR rtl-optimization/46556
> 	* gcc.dg/tree-ssa-pr46556-1.c: New testcase.
> 	* gcc.dg/tree-ssa-pr46556-2.c: Likewise.
> 	* gcc.dg/tree-ssa-pr46556-3.c: Likewise.
> 
> 
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand" } */
> +
> +struct x
> +{
> +  int a[16];
> +  int b[16];
> +  int c[16];
> +};
> +
> +extern void foo (int, int, int);
> +
> +void
> +f (struct x *p, unsigned int n)
> +{
> +  foo (p->a[n], p->c[n], p->b[n]);
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 "expand" } } */
> +/* { dg-final { cleanup-rtl-dump "expand" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand" } */
> +
> +struct x
> +{
> +  int a[16];
> +  int b[16];
> +  int c[16];
> +};
> +
> +extern void foo (int, int, int);
> +
> +void
> +f (struct x *p, unsigned int n)
> +{
> +  foo (p->a[n], p->c[n], p->b[n]);
> +  if (n > 12)
> +    foo (p->a[n], p->c[n], p->b[n]);
> +  else if (n > 3)
> +    foo (p->b[n], p->a[n], p->c[n]);
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> +/* { dg-final { cleanup-rtl-dump "expand" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand" } */
> +struct x
> +{
> +  int a[16];
> +  int b[16];
> +  int c[16];
> +};
> +
> +extern void foo (int, int, int);
> +
> +void
> +f (struct x *p, unsigned int n)
> +{
> +  foo (p->a[n], p->c[n], p->b[n]);
> +  if (n > 3)
> +    {
> +      foo (p->a[n], p->c[n], p->b[n]);
> +      if (n > 12)
> +	foo (p->b[n], p->a[n], p->c[n]);
> +    }
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> +/* { dg-final { cleanup-rtl-dump "expand" } } */
> Index: gcc/expr.c
> ===================================================================
> --- gcc/expr.c	(revision 180378)
> +++ gcc/expr.c	(working copy)
> @@ -56,6 +56,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "ssaexpand.h"
>  #include "target-globals.h"
>  #include "params.h"
> +#include "tree-pretty-print.h"
> 
>  /* Decide whether a function's arguments should be processed
>     from first to last or from last to first.
> @@ -7648,7 +7649,66 @@ expand_constructor (tree exp, rtx target, enum exp
>    return target;
>  }
> 
> +/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether
> +   there is a profitable opportunity to restructure address arithmetic
> +   within BASE and OFFSET.  If so, produce such a restructuring and
> +   return it.  */
> +/* TODO: This belongs more properly in a separate pass that performs
> +   general strength reduction on straight-line code.  Eventually move
> +   this there.  */
> 
> +static tree
> +restructure_base_and_offset (tree expr, tree base, tree offset,
> +			     HOST_WIDE_INT bitpos)
> +{
> +  tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref;
> +  double_int c, c1, c2, c3, c4;
> +
> +  /* Look for the following pattern:
> +
> +       base   = MEM_REF (T1, C1);
> +       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
> +       bitpos = C4 * BITS_PER_UNIT
> +
> +     If found, convert it to:
> +
> +       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
> +                C1 + (C2 * C3) + C4)
> +   */
> +  if (!base
> +      || !offset
> +      || TREE_CODE (base) != MEM_REF
> +      || TREE_CODE (offset) != MULT_EXPR)
> +    return NULL_TREE;
> +
> +  mult_op0 = TREE_OPERAND (offset, 0);
> +  mult_op1 = TREE_OPERAND (offset, 1);
> +
> +  if (TREE_CODE (mult_op0) != PLUS_EXPR
> +      || TREE_CODE (mult_op1) != INTEGER_CST
> +      || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST)
> +    return NULL_TREE;
> +
> +  t1 = TREE_OPERAND (base, 0);
> +  t2 = TREE_OPERAND (mult_op0, 0);
> +  c1 = mem_ref_offset (base);
> +  c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1));
> +  c3 = tree_to_double_int (mult_op1);
> +  c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT);
> +  c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4);
> +
> +  offset_type = TREE_TYPE (TREE_OPERAND (base, 1));
> +
> +  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, 
> +			   double_int_to_tree (sizetype, c3));
> +
> +  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
> +
> +  mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr,
> +			 double_int_to_tree (offset_type, c));
> +  return mem_ref;
> +}
> +
>  /* expand_expr: generate code for computing expression EXP.
>     An rtx for the computed value is returned.  The value is never null.
>     In the case of a void EXP, const0_rtx is returned.
> @@ -9584,7 +9644,7 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
>        {
>  	enum machine_mode mode1, mode2;
>  	HOST_WIDE_INT bitsize, bitpos;
> -	tree offset;
> +	tree offset, tem2;
>  	int volatilep = 0, must_force_mem;
>  	bool packedp = false;
>  	tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset,
> @@ -9601,6 +9661,36 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
>  		&& DECL_PACKED (TREE_OPERAND (exp, 1))))
>  	  packedp = true;
> 
> +	/* Attempt to restructure the base and offset to combine constants.
> +	   Bitfield references should eventually be lowered prior to this
> +	   point and are not dealt with.  Skip BLKmode aggregates as well.
> +	   This generates dead code, so require -O2.  */
> +	if (optimize > 1
> +	    && code != BIT_FIELD_REF
> +	    && (code != COMPONENT_REF 
> +		|| !DECL_BIT_FIELD (TREE_OPERAND (exp, 1)))
> +	    && TYPE_MODE (TREE_TYPE (exp)) != BLKmode
> +	    && bitpos % BITS_PER_UNIT == 0
> +	    && ((tem2 = restructure_base_and_offset (exp, tem, offset, bitpos))
> +		!= NULL_TREE))
> +	  {
> +	    copy_ref_info (tem2, exp);
> +	    tem = tem2;
> +	    /* The offset and bitpos were absorbed into the new MEM_REF, so 
> +	       make sure we don't add them in again.  */
> +	    offset = NULL_TREE;
> +	    bitpos = 0;
> +
> +	    if (dump_file && (dump_flags & TDF_DETAILS))
> +	      {
> +		fprintf (dump_file, "\nRestructured inner reference:\n");
> +		fprintf (dump_file, "  Original reference: ");
> +		print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS);
> +		fprintf (dump_file, "\n  Replacement: ");
> +		print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS);
> +	      }
> +	  }
> +
>  	/* If TEM's type is a union of variable size, pass TARGET to the inner
>  	   computation, since it will need a temporary and TARGET is known
>  	   to have to do.  This occurs in unchecked conversion in Ada.  */
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in	(revision 180378)
> +++ gcc/Makefile.in	(working copy)
> @@ -2926,7 +2926,7 @@ expr.o : expr.c $(CONFIG_H) $(SYSTEM_H) coretypes.
>     reload.h langhooks.h intl.h $(TM_P_H) $(TARGET_H) \
>     tree-iterator.h gt-expr.h $(MACHMODE_H) $(TIMEVAR_H) $(TREE_FLOW_H) \
>     $(TREE_PASS_H) $(DF_H) $(DIAGNOSTIC_H) vecprim.h $(SSAEXPAND_H) \
> -   $(PARAMS_H) $(COMMON_TARGET_H)
> +   $(PARAMS_H) $(COMMON_TARGET_H) tree-pretty-print.h
>  dojump.o : dojump.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) \
>     $(FLAGS_H) $(FUNCTION_H) $(EXPR_H) $(OPTABS_H) $(INSN_ATTR_H) insn-config.h \
>     langhooks.h $(GGC_H) gt-dojump.h vecprim.h $(BASIC_BLOCK_H) output.h
> 
>
Richard Biener Nov. 2, 2011, 11:55 a.m. UTC | #2
On Sun, 30 Oct 2011, William J. Schmidt wrote:

> Ping.
> 
> On Mon, 2011-10-24 at 08:38 -0500, William J. Schmidt wrote:
> > OK, I've removed the pointer-arithmetic case from expand, to be handled
> > later by straight-line strength reduction.  Here's the patch to deal
> > with just the specific pattern of PR46556 (which will also eventually be
> > handled by strength reduction, but not as quickly).
> > 
> > (FYI, I've been thinking through the strength reduction pass, and my
> > plan is to stage in some of the easiest cases first, hopefully for 4.7,
> > and gradually add the more complex pieces.  Explicit multiplies in the
> > IL with known constants can be done pretty easily.  More complexity is
> > added when the multiplier is a variable, when conditional increments are
> > present, and when multiplies are hidden in addressing expressions.)
> > 
> > The present patch was bootstrapped and regression-tested on
> > powerpc64-linux.  OK for trunk?

Hmmm ...

> > Thanks,
> > Bill
> > 
> > 
> > 2011-10-24  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> > 
> > gcc:
> > 
> > 	PR rtl-optimization/46556
> > 	* expr.c (restructure_base_and_offset): New function.
> > 	(expand_expr_real_1): Replace result of get_inner_reference
> > 	with result of restructure_base_and_offset when applicable.
> > 	* Makefile.in (expr.o): Update dependencies.
> > 
> > gcc/testsuite:
> > 
> > 	PR rtl-optimization/46556
> > 	* gcc.dg/tree-ssa-pr46556-1.c: New testcase.
> > 	* gcc.dg/tree-ssa-pr46556-2.c: Likewise.
> > 	* gcc.dg/tree-ssa-pr46556-3.c: Likewise.
> > 
> > 
> > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c
> > ===================================================================
> > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
> > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > +
> > +struct x
> > +{
> > +  int a[16];
> > +  int b[16];
> > +  int c[16];
> > +};
> > +
> > +extern void foo (int, int, int);
> > +
> > +void
> > +f (struct x *p, unsigned int n)
> > +{
> > +  foo (p->a[n], p->c[n], p->b[n]);
> > +}
> > +
> > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */
> > +/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */
> > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 "expand" } } */
> > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c
> > ===================================================================
> > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
> > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
> > @@ -0,0 +1,26 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > +
> > +struct x
> > +{
> > +  int a[16];
> > +  int b[16];
> > +  int c[16];
> > +};
> > +
> > +extern void foo (int, int, int);
> > +
> > +void
> > +f (struct x *p, unsigned int n)
> > +{
> > +  foo (p->a[n], p->c[n], p->b[n]);
> > +  if (n > 12)
> > +    foo (p->a[n], p->c[n], p->b[n]);
> > +  else if (n > 3)
> > +    foo (p->b[n], p->a[n], p->c[n]);
> > +}
> > +
> > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c
> > ===================================================================
> > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
> > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > +struct x
> > +{
> > +  int a[16];
> > +  int b[16];
> > +  int c[16];
> > +};
> > +
> > +extern void foo (int, int, int);
> > +
> > +void
> > +f (struct x *p, unsigned int n)
> > +{
> > +  foo (p->a[n], p->c[n], p->b[n]);
> > +  if (n > 3)
> > +    {
> > +      foo (p->a[n], p->c[n], p->b[n]);
> > +      if (n > 12)
> > +	foo (p->b[n], p->a[n], p->c[n]);
> > +    }
> > +}
> > +
> > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > Index: gcc/expr.c
> > ===================================================================
> > --- gcc/expr.c	(revision 180378)
> > +++ gcc/expr.c	(working copy)
> > @@ -56,6 +56,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "ssaexpand.h"
> >  #include "target-globals.h"
> >  #include "params.h"
> > +#include "tree-pretty-print.h"
> > 
> >  /* Decide whether a function's arguments should be processed
> >     from first to last or from last to first.
> > @@ -7648,7 +7649,66 @@ expand_constructor (tree exp, rtx target, enum exp
> >    return target;
> >  }
> > 
> > +/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether
> > +   there is a profitable opportunity to restructure address arithmetic
> > +   within BASE and OFFSET.  If so, produce such a restructuring and
> > +   return it.  */
> > +/* TODO: This belongs more properly in a separate pass that performs
> > +   general strength reduction on straight-line code.  Eventually move
> > +   this there.  */
> > 
> > +static tree
> > +restructure_base_and_offset (tree expr, tree base, tree offset,
> > +			     HOST_WIDE_INT bitpos)
> > +{
> > +  tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref;
> > +  double_int c, c1, c2, c3, c4;
> > +
> > +  /* Look for the following pattern:
> > +
> > +       base   = MEM_REF (T1, C1);
> > +       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
> > +       bitpos = C4 * BITS_PER_UNIT
> > +
> > +     If found, convert it to:
> > +
> > +       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
> > +                C1 + (C2 * C3) + C4)
> > +   */
> > +  if (!base
> > +      || !offset
> > +      || TREE_CODE (base) != MEM_REF
> > +      || TREE_CODE (offset) != MULT_EXPR)
> > +    return NULL_TREE;
> > +
> > +  mult_op0 = TREE_OPERAND (offset, 0);
> > +  mult_op1 = TREE_OPERAND (offset, 1);
> > +
> > +  if (TREE_CODE (mult_op0) != PLUS_EXPR
> > +      || TREE_CODE (mult_op1) != INTEGER_CST
> > +      || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST)
> > +    return NULL_TREE;
> > +
> > +  t1 = TREE_OPERAND (base, 0);
> > +  t2 = TREE_OPERAND (mult_op0, 0);
> > +  c1 = mem_ref_offset (base);
> > +  c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1));
> > +  c3 = tree_to_double_int (mult_op1);
> > +  c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT);
> > +  c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4);
> > +
> > +  offset_type = TREE_TYPE (TREE_OPERAND (base, 1));
> > +
> > +  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, 
> > +			   double_int_to_tree (sizetype, c3));
> > +
> > +  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
> > +
> > +  mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr,
> > +			 double_int_to_tree (offset_type, c));
> > +  return mem_ref;
> > +}
> > +
> >  /* expand_expr: generate code for computing expression EXP.
> >     An rtx for the computed value is returned.  The value is never null.
> >     In the case of a void EXP, const0_rtx is returned.
> > @@ -9584,7 +9644,7 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
> >        {
> >  	enum machine_mode mode1, mode2;
> >  	HOST_WIDE_INT bitsize, bitpos;
> > -	tree offset;
> > +	tree offset, tem2;
> >  	int volatilep = 0, must_force_mem;
> >  	bool packedp = false;
> >  	tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset,
> > @@ -9601,6 +9661,36 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
> >  		&& DECL_PACKED (TREE_OPERAND (exp, 1))))
> >  	  packedp = true;
> > 
> > +	/* Attempt to restructure the base and offset to combine constants.
> > +	   Bitfield references should eventually be lowered prior to this
> > +	   point and are not dealt with.  Skip BLKmode aggregates as well.
> > +	   This generates dead code, so require -O2.  */
> > +	if (optimize > 1
> > +	    && code != BIT_FIELD_REF
> > +	    && (code != COMPONENT_REF 
> > +		|| !DECL_BIT_FIELD (TREE_OPERAND (exp, 1)))
> > +	    && TYPE_MODE (TREE_TYPE (exp)) != BLKmode
> > +	    && bitpos % BITS_PER_UNIT == 0
> > +	    && ((tem2 = restructure_base_and_offset (exp, tem, offset, bitpos))
> > +		!= NULL_TREE))
> > +	  {
> > +	    copy_ref_info (tem2, exp);
> > +	    tem = tem2;
> > +	    /* The offset and bitpos were absorbed into the new MEM_REF, so 
> > +	       make sure we don't add them in again.  */
> > +	    offset = NULL_TREE;
> > +	    bitpos = 0;

I'm not sure it's a good idea to do that.  Why not instead just
transform the offset and bitpos parts?  Otherwise you are going
to lose all subsequent logic that tries to optimize offsetted
base addressing.

Thus, instead of

> > +       base   = MEM_REF (T1, C1);
> > +       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
> > +       bitpos = C4 * BITS_PER_UNIT
> > +  
> > +     If found, convert it to:
> > +  
> > +       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
> > +                C1 + (C2 * C3) + C4)

convert it to

   base = MEM_REF (T1, 0);
   offset = MULT_EXPR (T2, C3);
   bitpos = C1 + (C2 * C3) + C4;

?

Also note that with only handling this here you fail to handle
stores (see expand_assignment, or just grep for the places that
handle MEM_REF).

If we want to handle this during expansion we probably want to
factor out a routine that handles address-generation for all
of the cases.

Sorry for pushing back again ... this area of GCC is "interesting" ;)
And yes, the current code is a mess - well accumulated 25 years of
"optimization" ... thus changing this area is not to be taken easily.

Thanks,
Richard.
Bill Schmidt Nov. 2, 2011, 12:24 p.m. UTC | #3
On Wed, 2011-11-02 at 12:55 +0100, Richard Guenther wrote:
> On Sun, 30 Oct 2011, William J. Schmidt wrote:
> 
> > Ping.
> > 
> > On Mon, 2011-10-24 at 08:38 -0500, William J. Schmidt wrote:
> > > OK, I've removed the pointer-arithmetic case from expand, to be handled
> > > later by straight-line strength reduction.  Here's the patch to deal
> > > with just the specific pattern of PR46556 (which will also eventually be
> > > handled by strength reduction, but not as quickly).
> > > 
> > > (FYI, I've been thinking through the strength reduction pass, and my
> > > plan is to stage in some of the easiest cases first, hopefully for 4.7,
> > > and gradually add the more complex pieces.  Explicit multiplies in the
> > > IL with known constants can be done pretty easily.  More complexity is
> > > added when the multiplier is a variable, when conditional increments are
> > > present, and when multiplies are hidden in addressing expressions.)
> > > 
> > > The present patch was bootstrapped and regression-tested on
> > > powerpc64-linux.  OK for trunk?
> 
> Hmmm ...
> 
> > > Thanks,
> > > Bill
> > > 
> > > 
> > > 2011-10-24  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> > > 
> > > gcc:
> > > 
> > > 	PR rtl-optimization/46556
> > > 	* expr.c (restructure_base_and_offset): New function.
> > > 	(expand_expr_real_1): Replace result of get_inner_reference
> > > 	with result of restructure_base_and_offset when applicable.
> > > 	* Makefile.in (expr.o): Update dependencies.
> > > 
> > > gcc/testsuite:
> > > 
> > > 	PR rtl-optimization/46556
> > > 	* gcc.dg/tree-ssa-pr46556-1.c: New testcase.
> > > 	* gcc.dg/tree-ssa-pr46556-2.c: Likewise.
> > > 	* gcc.dg/tree-ssa-pr46556-3.c: Likewise.
> > > 
> > > 
> > > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c
> > > ===================================================================
> > > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
> > > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
> > > @@ -0,0 +1,22 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > > +
> > > +struct x
> > > +{
> > > +  int a[16];
> > > +  int b[16];
> > > +  int c[16];
> > > +};
> > > +
> > > +extern void foo (int, int, int);
> > > +
> > > +void
> > > +f (struct x *p, unsigned int n)
> > > +{
> > > +  foo (p->a[n], p->c[n], p->b[n]);
> > > +}
> > > +
> > > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */
> > > +/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */
> > > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 "expand" } } */
> > > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c
> > > ===================================================================
> > > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
> > > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
> > > @@ -0,0 +1,26 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > > +
> > > +struct x
> > > +{
> > > +  int a[16];
> > > +  int b[16];
> > > +  int c[16];
> > > +};
> > > +
> > > +extern void foo (int, int, int);
> > > +
> > > +void
> > > +f (struct x *p, unsigned int n)
> > > +{
> > > +  foo (p->a[n], p->c[n], p->b[n]);
> > > +  if (n > 12)
> > > +    foo (p->a[n], p->c[n], p->b[n]);
> > > +  else if (n > 3)
> > > +    foo (p->b[n], p->a[n], p->c[n]);
> > > +}
> > > +
> > > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> > > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> > > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> > > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c
> > > ===================================================================
> > > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
> > > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
> > > @@ -0,0 +1,27 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > > +struct x
> > > +{
> > > +  int a[16];
> > > +  int b[16];
> > > +  int c[16];
> > > +};
> > > +
> > > +extern void foo (int, int, int);
> > > +
> > > +void
> > > +f (struct x *p, unsigned int n)
> > > +{
> > > +  foo (p->a[n], p->c[n], p->b[n]);
> > > +  if (n > 3)
> > > +    {
> > > +      foo (p->a[n], p->c[n], p->b[n]);
> > > +      if (n > 12)
> > > +	foo (p->b[n], p->a[n], p->c[n]);
> > > +    }
> > > +}
> > > +
> > > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> > > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> > > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> > > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > > Index: gcc/expr.c
> > > ===================================================================
> > > --- gcc/expr.c	(revision 180378)
> > > +++ gcc/expr.c	(working copy)
> > > @@ -56,6 +56,7 @@ along with GCC; see the file COPYING3.  If not see
> > >  #include "ssaexpand.h"
> > >  #include "target-globals.h"
> > >  #include "params.h"
> > > +#include "tree-pretty-print.h"
> > > 
> > >  /* Decide whether a function's arguments should be processed
> > >     from first to last or from last to first.
> > > @@ -7648,7 +7649,66 @@ expand_constructor (tree exp, rtx target, enum exp
> > >    return target;
> > >  }
> > > 
> > > +/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether
> > > +   there is a profitable opportunity to restructure address arithmetic
> > > +   within BASE and OFFSET.  If so, produce such a restructuring and
> > > +   return it.  */
> > > +/* TODO: This belongs more properly in a separate pass that performs
> > > +   general strength reduction on straight-line code.  Eventually move
> > > +   this there.  */
> > > 
> > > +static tree
> > > +restructure_base_and_offset (tree expr, tree base, tree offset,
> > > +			     HOST_WIDE_INT bitpos)
> > > +{
> > > +  tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref;
> > > +  double_int c, c1, c2, c3, c4;
> > > +
> > > +  /* Look for the following pattern:
> > > +
> > > +       base   = MEM_REF (T1, C1);
> > > +       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
> > > +       bitpos = C4 * BITS_PER_UNIT
> > > +
> > > +     If found, convert it to:
> > > +
> > > +       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
> > > +                C1 + (C2 * C3) + C4)
> > > +   */
> > > +  if (!base
> > > +      || !offset
> > > +      || TREE_CODE (base) != MEM_REF
> > > +      || TREE_CODE (offset) != MULT_EXPR)
> > > +    return NULL_TREE;
> > > +
> > > +  mult_op0 = TREE_OPERAND (offset, 0);
> > > +  mult_op1 = TREE_OPERAND (offset, 1);
> > > +
> > > +  if (TREE_CODE (mult_op0) != PLUS_EXPR
> > > +      || TREE_CODE (mult_op1) != INTEGER_CST
> > > +      || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST)
> > > +    return NULL_TREE;
> > > +
> > > +  t1 = TREE_OPERAND (base, 0);
> > > +  t2 = TREE_OPERAND (mult_op0, 0);
> > > +  c1 = mem_ref_offset (base);
> > > +  c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1));
> > > +  c3 = tree_to_double_int (mult_op1);
> > > +  c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT);
> > > +  c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4);
> > > +
> > > +  offset_type = TREE_TYPE (TREE_OPERAND (base, 1));
> > > +
> > > +  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, 
> > > +			   double_int_to_tree (sizetype, c3));
> > > +
> > > +  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
> > > +
> > > +  mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr,
> > > +			 double_int_to_tree (offset_type, c));
> > > +  return mem_ref;
> > > +}
> > > +
> > >  /* expand_expr: generate code for computing expression EXP.
> > >     An rtx for the computed value is returned.  The value is never null.
> > >     In the case of a void EXP, const0_rtx is returned.
> > > @@ -9584,7 +9644,7 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
> > >        {
> > >  	enum machine_mode mode1, mode2;
> > >  	HOST_WIDE_INT bitsize, bitpos;
> > > -	tree offset;
> > > +	tree offset, tem2;
> > >  	int volatilep = 0, must_force_mem;
> > >  	bool packedp = false;
> > >  	tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset,
> > > @@ -9601,6 +9661,36 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
> > >  		&& DECL_PACKED (TREE_OPERAND (exp, 1))))
> > >  	  packedp = true;
> > > 
> > > +	/* Attempt to restructure the base and offset to combine constants.
> > > +	   Bitfield references should eventually be lowered prior to this
> > > +	   point and are not dealt with.  Skip BLKmode aggregates as well.
> > > +	   This generates dead code, so require -O2.  */
> > > +	if (optimize > 1
> > > +	    && code != BIT_FIELD_REF
> > > +	    && (code != COMPONENT_REF 
> > > +		|| !DECL_BIT_FIELD (TREE_OPERAND (exp, 1)))
> > > +	    && TYPE_MODE (TREE_TYPE (exp)) != BLKmode
> > > +	    && bitpos % BITS_PER_UNIT == 0
> > > +	    && ((tem2 = restructure_base_and_offset (exp, tem, offset, bitpos))
> > > +		!= NULL_TREE))
> > > +	  {
> > > +	    copy_ref_info (tem2, exp);
> > > +	    tem = tem2;
> > > +	    /* The offset and bitpos were absorbed into the new MEM_REF, so 
> > > +	       make sure we don't add them in again.  */
> > > +	    offset = NULL_TREE;
> > > +	    bitpos = 0;
> 
> I'm not sure it's a good idea to do that.  Why not instead just
> transform the offset and bitpos parts?  Otherwise you are going
> to lose all subsequent logic that tries to optimize offsetted
> base addressing.
> 
> Thus, instead of
> 
> > > +       base   = MEM_REF (T1, C1);
> > > +       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
> > > +       bitpos = C4 * BITS_PER_UNIT
> > > +  
> > > +     If found, convert it to:
> > > +  
> > > +       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
> > > +                C1 + (C2 * C3) + C4)
> 
> convert it to
> 
>    base = MEM_REF (T1, 0);
>    offset = MULT_EXPR (T2, C3);
>    bitpos = C1 + (C2 * C3) + C4;
> 
> ?

Hm.  I'll have to play with that to be sure it still solves our original
problem after all the downstream logic kicks in...  

> 
> Also note that with only handling this here you fail to handle
> stores (see expand_assignment, or just grep for the places that
> handle MEM_REF).

Bleah.  OK.  Expand is a messy beast -- you had pointed me at this spot
a while back and it looked on the surface like it handled both...sigh.

> 
> If we want to handle this during expansion we probably want to
> factor out a routine that handles address-generation for all
> of the cases.

Well, doing it during expand was intended to be an "easy" temporary
solution for 4.7 until we can do it "right" in the tree phases as part
of strength reduction.  The "easy" part seems to have gone into
hiding. ;)  If you feel the strength reduction pass is generally on the
right track, it may be best to concentrate on getting that completed
rather than keep pulling on loose threads in expand.  Let me know your
preference.

> 
> Sorry for pushing back again ... this area of GCC is "interesting" ;)
> And yes, the current code is a mess - well accumulated 25 years of
> "optimization" ... thus changing this area is not to be taken easily.
> 

Yep, I'm all for getting it right.  Thanks for the review...

Bill

> Thanks,
> Richard.
>
Richard Biener Nov. 2, 2011, 2:14 p.m. UTC | #4
On Wed, 2 Nov 2011, William J. Schmidt wrote:

> On Wed, 2011-11-02 at 12:55 +0100, Richard Guenther wrote:
> > On Sun, 30 Oct 2011, William J. Schmidt wrote:
> > 
> > > Ping.
> > > 
> > > On Mon, 2011-10-24 at 08:38 -0500, William J. Schmidt wrote:
> > > > OK, I've removed the pointer-arithmetic case from expand, to be handled
> > > > later by straight-line strength reduction.  Here's the patch to deal
> > > > with just the specific pattern of PR46556 (which will also eventually be
> > > > handled by strength reduction, but not as quickly).
> > > > 
> > > > (FYI, I've been thinking through the strength reduction pass, and my
> > > > plan is to stage in some of the easiest cases first, hopefully for 4.7,
> > > > and gradually add the more complex pieces.  Explicit multiplies in the
> > > > IL with known constants can be done pretty easily.  More complexity is
> > > > added when the multiplier is a variable, when conditional increments are
> > > > present, and when multiplies are hidden in addressing expressions.)
> > > > 
> > > > The present patch was bootstrapped and regression-tested on
> > > > powerpc64-linux.  OK for trunk?
> > 
> > Hmmm ...
> > 
> > > > Thanks,
> > > > Bill
> > > > 
> > > > 
> > > > 2011-10-24  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> > > > 
> > > > gcc:
> > > > 
> > > > 	PR rtl-optimization/46556
> > > > 	* expr.c (restructure_base_and_offset): New function.
> > > > 	(expand_expr_real_1): Replace result of get_inner_reference
> > > > 	with result of restructure_base_and_offset when applicable.
> > > > 	* Makefile.in (expr.o): Update dependencies.
> > > > 
> > > > gcc/testsuite:
> > > > 
> > > > 	PR rtl-optimization/46556
> > > > 	* gcc.dg/tree-ssa-pr46556-1.c: New testcase.
> > > > 	* gcc.dg/tree-ssa-pr46556-2.c: Likewise.
> > > > 	* gcc.dg/tree-ssa-pr46556-3.c: Likewise.
> > > > 
> > > > 
> > > > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c
> > > > ===================================================================
> > > > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
> > > > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
> > > > @@ -0,0 +1,22 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > > > +
> > > > +struct x
> > > > +{
> > > > +  int a[16];
> > > > +  int b[16];
> > > > +  int c[16];
> > > > +};
> > > > +
> > > > +extern void foo (int, int, int);
> > > > +
> > > > +void
> > > > +f (struct x *p, unsigned int n)
> > > > +{
> > > > +  foo (p->a[n], p->c[n], p->b[n]);
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */
> > > > +/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */
> > > > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 "expand" } } */
> > > > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > > > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c
> > > > ===================================================================
> > > > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
> > > > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
> > > > @@ -0,0 +1,26 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > > > +
> > > > +struct x
> > > > +{
> > > > +  int a[16];
> > > > +  int b[16];
> > > > +  int c[16];
> > > > +};
> > > > +
> > > > +extern void foo (int, int, int);
> > > > +
> > > > +void
> > > > +f (struct x *p, unsigned int n)
> > > > +{
> > > > +  foo (p->a[n], p->c[n], p->b[n]);
> > > > +  if (n > 12)
> > > > +    foo (p->a[n], p->c[n], p->b[n]);
> > > > +  else if (n > 3)
> > > > +    foo (p->b[n], p->a[n], p->c[n]);
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> > > > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> > > > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> > > > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > > > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c
> > > > ===================================================================
> > > > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
> > > > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
> > > > @@ -0,0 +1,27 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -fdump-rtl-expand" } */
> > > > +struct x
> > > > +{
> > > > +  int a[16];
> > > > +  int b[16];
> > > > +  int c[16];
> > > > +};
> > > > +
> > > > +extern void foo (int, int, int);
> > > > +
> > > > +void
> > > > +f (struct x *p, unsigned int n)
> > > > +{
> > > > +  foo (p->a[n], p->c[n], p->b[n]);
> > > > +  if (n > 3)
> > > > +    {
> > > > +      foo (p->a[n], p->c[n], p->b[n]);
> > > > +      if (n > 12)
> > > > +	foo (p->b[n], p->a[n], p->c[n]);
> > > > +    }
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> > > > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> > > > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> > > > +/* { dg-final { cleanup-rtl-dump "expand" } } */
> > > > Index: gcc/expr.c
> > > > ===================================================================
> > > > --- gcc/expr.c	(revision 180378)
> > > > +++ gcc/expr.c	(working copy)
> > > > @@ -56,6 +56,7 @@ along with GCC; see the file COPYING3.  If not see
> > > >  #include "ssaexpand.h"
> > > >  #include "target-globals.h"
> > > >  #include "params.h"
> > > > +#include "tree-pretty-print.h"
> > > > 
> > > >  /* Decide whether a function's arguments should be processed
> > > >     from first to last or from last to first.
> > > > @@ -7648,7 +7649,66 @@ expand_constructor (tree exp, rtx target, enum exp
> > > >    return target;
> > > >  }
> > > > 
> > > > +/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether
> > > > +   there is a profitable opportunity to restructure address arithmetic
> > > > +   within BASE and OFFSET.  If so, produce such a restructuring and
> > > > +   return it.  */
> > > > +/* TODO: This belongs more properly in a separate pass that performs
> > > > +   general strength reduction on straight-line code.  Eventually move
> > > > +   this there.  */
> > > > 
> > > > +static tree
> > > > +restructure_base_and_offset (tree expr, tree base, tree offset,
> > > > +			     HOST_WIDE_INT bitpos)
> > > > +{
> > > > +  tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref;
> > > > +  double_int c, c1, c2, c3, c4;
> > > > +
> > > > +  /* Look for the following pattern:
> > > > +
> > > > +       base   = MEM_REF (T1, C1);
> > > > +       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
> > > > +       bitpos = C4 * BITS_PER_UNIT
> > > > +
> > > > +     If found, convert it to:
> > > > +
> > > > +       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
> > > > +                C1 + (C2 * C3) + C4)
> > > > +   */
> > > > +  if (!base
> > > > +      || !offset
> > > > +      || TREE_CODE (base) != MEM_REF
> > > > +      || TREE_CODE (offset) != MULT_EXPR)
> > > > +    return NULL_TREE;
> > > > +
> > > > +  mult_op0 = TREE_OPERAND (offset, 0);
> > > > +  mult_op1 = TREE_OPERAND (offset, 1);
> > > > +
> > > > +  if (TREE_CODE (mult_op0) != PLUS_EXPR
> > > > +      || TREE_CODE (mult_op1) != INTEGER_CST
> > > > +      || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST)
> > > > +    return NULL_TREE;
> > > > +
> > > > +  t1 = TREE_OPERAND (base, 0);
> > > > +  t2 = TREE_OPERAND (mult_op0, 0);
> > > > +  c1 = mem_ref_offset (base);
> > > > +  c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1));
> > > > +  c3 = tree_to_double_int (mult_op1);
> > > > +  c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT);
> > > > +  c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4);
> > > > +
> > > > +  offset_type = TREE_TYPE (TREE_OPERAND (base, 1));
> > > > +
> > > > +  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, 
> > > > +			   double_int_to_tree (sizetype, c3));
> > > > +
> > > > +  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
> > > > +
> > > > +  mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr,
> > > > +			 double_int_to_tree (offset_type, c));
> > > > +  return mem_ref;
> > > > +}
> > > > +
> > > >  /* expand_expr: generate code for computing expression EXP.
> > > >     An rtx for the computed value is returned.  The value is never null.
> > > >     In the case of a void EXP, const0_rtx is returned.
> > > > @@ -9584,7 +9644,7 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
> > > >        {
> > > >  	enum machine_mode mode1, mode2;
> > > >  	HOST_WIDE_INT bitsize, bitpos;
> > > > -	tree offset;
> > > > +	tree offset, tem2;
> > > >  	int volatilep = 0, must_force_mem;
> > > >  	bool packedp = false;
> > > >  	tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset,
> > > > @@ -9601,6 +9661,36 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
> > > >  		&& DECL_PACKED (TREE_OPERAND (exp, 1))))
> > > >  	  packedp = true;
> > > > 
> > > > +	/* Attempt to restructure the base and offset to combine constants.
> > > > +	   Bitfield references should eventually be lowered prior to this
> > > > +	   point and are not dealt with.  Skip BLKmode aggregates as well.
> > > > +	   This generates dead code, so require -O2.  */
> > > > +	if (optimize > 1
> > > > +	    && code != BIT_FIELD_REF
> > > > +	    && (code != COMPONENT_REF 
> > > > +		|| !DECL_BIT_FIELD (TREE_OPERAND (exp, 1)))
> > > > +	    && TYPE_MODE (TREE_TYPE (exp)) != BLKmode
> > > > +	    && bitpos % BITS_PER_UNIT == 0
> > > > +	    && ((tem2 = restructure_base_and_offset (exp, tem, offset, bitpos))
> > > > +		!= NULL_TREE))
> > > > +	  {
> > > > +	    copy_ref_info (tem2, exp);
> > > > +	    tem = tem2;
> > > > +	    /* The offset and bitpos were absorbed into the new MEM_REF, so 
> > > > +	       make sure we don't add them in again.  */
> > > > +	    offset = NULL_TREE;
> > > > +	    bitpos = 0;
> > 
> > I'm not sure it's a good idea to do that.  Why not instead just
> > transform the offset and bitpos parts?  Otherwise you are going
> > to lose all subsequent logic that tries to optimize offsetted
> > base addressing.
> > 
> > Thus, instead of
> > 
> > > > +       base   = MEM_REF (T1, C1);
> > > > +       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
> > > > +       bitpos = C4 * BITS_PER_UNIT
> > > > +  
> > > > +     If found, convert it to:
> > > > +  
> > > > +       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
> > > > +                C1 + (C2 * C3) + C4)
> > 
> > convert it to
> > 
> >    base = MEM_REF (T1, 0);
> >    offset = MULT_EXPR (T2, C3);
> >    bitpos = C1 + (C2 * C3) + C4;
> > 
> > ?
> 
> Hm.  I'll have to play with that to be sure it still solves our original
> problem after all the downstream logic kicks in...  
> 
> > 
> > Also note that with only handling this here you fail to handle
> > stores (see expand_assignment, or just grep for the places that
> > handle MEM_REF).
> 
> Bleah.  OK.  Expand is a messy beast -- you had pointed me at this spot
> a while back and it looked on the surface like it handled both...sigh.
> 
> > 
> > If we want to handle this during expansion we probably want to
> > factor out a routine that handles address-generation for all
> > of the cases.
> 
> Well, doing it during expand was intended to be an "easy" temporary
> solution for 4.7 until we can do it "right" in the tree phases as part
> of strength reduction.  The "easy" part seems to have gone into
> hiding. ;)  If you feel the strength reduction pass is generally on the
> right track, it may be best to concentrate on getting that completed
> rather than keep pulling on loose threads in expand.  Let me know your
> preference.

Yeah, strength reduction looks like a better approach to me.  After
all an easy temporary solution is likely going to regress for someone
(esp. touching that area...).

I've been hoping for others to chime in with opinions, but appearantly
folks are too busy... ;)

Richard.
diff mbox

Patch

Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+  if (n > 12)
+    foo (p->a[n], p->c[n], p->b[n]);
+  else if (n > 3)
+    foo (p->b[n], p->a[n], p->c[n]);
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+  if (n > 3)
+    {
+      foo (p->a[n], p->c[n], p->b[n]);
+      if (n > 12)
+	foo (p->b[n], p->a[n], p->c[n]);
+    }
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/expr.c
===================================================================
--- gcc/expr.c	(revision 180378)
+++ gcc/expr.c	(working copy)
@@ -56,6 +56,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "ssaexpand.h"
 #include "target-globals.h"
 #include "params.h"
+#include "tree-pretty-print.h"
 
 /* Decide whether a function's arguments should be processed
    from first to last or from last to first.
@@ -7648,7 +7649,66 @@  expand_constructor (tree exp, rtx target, enum exp
   return target;
 }
 
+/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether
+   there is a profitable opportunity to restructure address arithmetic
+   within BASE and OFFSET.  If so, produce such a restructuring and
+   return it.  */
+/* TODO: This belongs more properly in a separate pass that performs
+   general strength reduction on straight-line code.  Eventually move
+   this there.  */
 
+static tree
+restructure_base_and_offset (tree expr, tree base, tree offset,
+			     HOST_WIDE_INT bitpos)
+{
+  tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref;
+  double_int c, c1, c2, c3, c4;
+
+  /* Look for the following pattern:
+
+       base   = MEM_REF (T1, C1);
+       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
+       bitpos = C4 * BITS_PER_UNIT
+
+     If found, convert it to:
+
+       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
+                C1 + (C2 * C3) + C4)
+   */
+  if (!base
+      || !offset
+      || TREE_CODE (base) != MEM_REF
+      || TREE_CODE (offset) != MULT_EXPR)
+    return NULL_TREE;
+
+  mult_op0 = TREE_OPERAND (offset, 0);
+  mult_op1 = TREE_OPERAND (offset, 1);
+
+  if (TREE_CODE (mult_op0) != PLUS_EXPR
+      || TREE_CODE (mult_op1) != INTEGER_CST
+      || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST)
+    return NULL_TREE;
+
+  t1 = TREE_OPERAND (base, 0);
+  t2 = TREE_OPERAND (mult_op0, 0);
+  c1 = mem_ref_offset (base);
+  c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1));
+  c3 = tree_to_double_int (mult_op1);
+  c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT);
+  c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4);
+
+  offset_type = TREE_TYPE (TREE_OPERAND (base, 1));
+
+  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, 
+			   double_int_to_tree (sizetype, c3));
+
+  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
+
+  mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr,
+			 double_int_to_tree (offset_type, c));
+  return mem_ref;
+}
+
 /* expand_expr: generate code for computing expression EXP.
    An rtx for the computed value is returned.  The value is never null.
    In the case of a void EXP, const0_rtx is returned.
@@ -9584,7 +9644,7 @@  expand_expr_real_1 (tree exp, rtx target, enum mac
       {
 	enum machine_mode mode1, mode2;
 	HOST_WIDE_INT bitsize, bitpos;
-	tree offset;
+	tree offset, tem2;
 	int volatilep = 0, must_force_mem;
 	bool packedp = false;
 	tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset,
@@ -9601,6 +9661,36 @@  expand_expr_real_1 (tree exp, rtx target, enum mac
 		&& DECL_PACKED (TREE_OPERAND (exp, 1))))
 	  packedp = true;
 
+	/* Attempt to restructure the base and offset to combine constants.
+	   Bitfield references should eventually be lowered prior to this
+	   point and are not dealt with.  Skip BLKmode aggregates as well.
+	   This generates dead code, so require -O2.  */
+	if (optimize > 1
+	    && code != BIT_FIELD_REF
+	    && (code != COMPONENT_REF 
+		|| !DECL_BIT_FIELD (TREE_OPERAND (exp, 1)))
+	    && TYPE_MODE (TREE_TYPE (exp)) != BLKmode
+	    && bitpos % BITS_PER_UNIT == 0
+	    && ((tem2 = restructure_base_and_offset (exp, tem, offset, bitpos))
+		!= NULL_TREE))
+	  {
+	    copy_ref_info (tem2, exp);
+	    tem = tem2;
+	    /* The offset and bitpos were absorbed into the new MEM_REF, so 
+	       make sure we don't add them in again.  */
+	    offset = NULL_TREE;
+	    bitpos = 0;
+
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      {
+		fprintf (dump_file, "\nRestructured inner reference:\n");
+		fprintf (dump_file, "  Original reference: ");
+		print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS);
+		fprintf (dump_file, "\n  Replacement: ");
+		print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS);
+	      }
+	  }
+
 	/* If TEM's type is a union of variable size, pass TARGET to the inner
 	   computation, since it will need a temporary and TARGET is known
 	   to have to do.  This occurs in unchecked conversion in Ada.  */
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in	(revision 180378)
+++ gcc/Makefile.in	(working copy)
@@ -2926,7 +2926,7 @@  expr.o : expr.c $(CONFIG_H) $(SYSTEM_H) coretypes.
    reload.h langhooks.h intl.h $(TM_P_H) $(TARGET_H) \
    tree-iterator.h gt-expr.h $(MACHMODE_H) $(TIMEVAR_H) $(TREE_FLOW_H) \
    $(TREE_PASS_H) $(DF_H) $(DIAGNOSTIC_H) vecprim.h $(SSAEXPAND_H) \
-   $(PARAMS_H) $(COMMON_TARGET_H)
+   $(PARAMS_H) $(COMMON_TARGET_H) tree-pretty-print.h
 dojump.o : dojump.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) \
    $(FLAGS_H) $(FUNCTION_H) $(EXPR_H) $(OPTABS_H) $(INSN_ATTR_H) insn-config.h \
    langhooks.h $(GGC_H) gt-dojump.h vecprim.h $(BASIC_BLOCK_H) output.h