diff mbox

Fix PR46556 (poor address generation)

Message ID 1318947241.22857.13.camel@gnopaine
State New
Headers show

Commit Message

Bill Schmidt Oct. 18, 2011, 2:14 p.m. UTC
Greetings,

Here is a new revision of the tree portions of this patch.  I moved the
pattern recognizer to expand, and added additional logic to look for the
same pattern in gimple form.  I added two more tests to verify the new
logic.

I didn't run into any problems with the RTL CSE phases.  I can't recall
for certain what caused me to abandon the expand version previously.
There may not have been good reason; too many versions to keep track of
and too many interruptions, I suppose.  In any case, I'm much happier
having this code in the expander.

Paolo's RTL logic for unpropagating the zero-offset case is not going to
work out as is.  It causes a number of performance degradations, which I
suspect are due to the pass reordering.  That's a separate issue,
though, and not needed for this patch.

Bootstrapped and regression-tested on powerpc64-linux.  SPEC cpu2000
shows a number of small improvements and no significant degradations.
SPEC cpu2006 testing is pending.

Thanks,
Bill


2011-10-18  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

gcc:

	PR rtl-optimization/46556
	* expr.c (tree-pretty-print.h): New include.
	(restructure_base_and_offset): New function.
	(restructure_mem_ref): New function.
	(expand_expr_real_1): In MEM_REF case, attempt restructure_mem_ref
	first.  In normal_inner_ref case, attempt restructure_base_and_offset
	first.
	* Makefile.in: Update dependences for expr.o.

gcc/testsuite:

	PR rtl-optimization/46556
	* gcc.dg/tree-ssa-pr46556-1.c: New test.
	* gcc.dg/tree-ssa-pr46556-2.c: Likewise.
	* gcc.dg/tree-ssa-pr46556-3.c: Likewise.
	* gcc.dg/tree-ssa-pr46556-4.c: Likewise.
	* gcc.dg/tree-ssa-pr46556-5.c: Likewise.

Comments

Richard Biener Oct. 21, 2011, 9:26 a.m. UTC | #1
On Tue, Oct 18, 2011 at 4:14 PM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> Greetings,
>
> Here is a new revision of the tree portions of this patch.  I moved the
> pattern recognizer to expand, and added additional logic to look for the
> same pattern in gimple form.  I added two more tests to verify the new
> logic.
>
> I didn't run into any problems with the RTL CSE phases.  I can't recall
> for certain what caused me to abandon the expand version previously.
> There may not have been good reason; too many versions to keep track of
> and too many interruptions, I suppose.  In any case, I'm much happier
> having this code in the expander.
>
> Paolo's RTL logic for unpropagating the zero-offset case is not going to
> work out as is.  It causes a number of performance degradations, which I
> suspect are due to the pass reordering.  That's a separate issue,
> though, and not needed for this patch.
>
> Bootstrapped and regression-tested on powerpc64-linux.  SPEC cpu2000
> shows a number of small improvements and no significant degradations.
> SPEC cpu2006 testing is pending.
>
> Thanks,
> Bill
>
>
> 2011-10-18  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
>
> gcc:
>
>        PR rtl-optimization/46556
>        * expr.c (tree-pretty-print.h): New include.
>        (restructure_base_and_offset): New function.
>        (restructure_mem_ref): New function.
>        (expand_expr_real_1): In MEM_REF case, attempt restructure_mem_ref
>        first.  In normal_inner_ref case, attempt restructure_base_and_offset
>        first.
>        * Makefile.in: Update dependences for expr.o.
>
> gcc/testsuite:
>
>        PR rtl-optimization/46556
>        * gcc.dg/tree-ssa-pr46556-1.c: New test.
>        * gcc.dg/tree-ssa-pr46556-2.c: Likewise.
>        * gcc.dg/tree-ssa-pr46556-3.c: Likewise.
>        * gcc.dg/tree-ssa-pr46556-4.c: Likewise.
>        * gcc.dg/tree-ssa-pr46556-5.c: Likewise.
>
>
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c   (revision 0)
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand" } */
> +
> +struct x
> +{
> +  int a[16];
> +  int b[16];
> +  int c[16];
> +};
> +
> +extern void foo (int, int, int);
> +
> +void
> +f (struct x *p, unsigned int n)
> +{
> +  foo (p->a[n], p->c[n], p->b[n]);
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 "expand" } } */
> +/* { dg-final { cleanup-rtl-dump "expand" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c   (revision 0)
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand" } */
> +
> +struct x
> +{
> +  int a[16];
> +  int b[16];
> +  int c[16];
> +};
> +
> +extern void foo (int, int, int);
> +
> +void
> +f (struct x *p, unsigned int n)
> +{
> +  foo (p->a[n], p->c[n], p->b[n]);
> +  if (n > 12)
> +    foo (p->a[n], p->c[n], p->b[n]);
> +  else if (n > 3)
> +    foo (p->b[n], p->a[n], p->c[n]);
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> +/* { dg-final { cleanup-rtl-dump "expand" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c   (revision 0)
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand" } */
> +struct x
> +{
> +  int a[16];
> +  int b[16];
> +  int c[16];
> +};
> +
> +extern void foo (int, int, int);
> +
> +void
> +f (struct x *p, unsigned int n)
> +{
> +  foo (p->a[n], p->c[n], p->b[n]);
> +  if (n > 3)
> +    {
> +      foo (p->a[n], p->c[n], p->b[n]);
> +      if (n > 12)
> +       foo (p->b[n], p->a[n], p->c[n]);
> +    }
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> +/* { dg-final { cleanup-rtl-dump "expand" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c   (revision 0)
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand" } */
> +
> +struct x
> +{
> +  int a[16];
> +  int b[16];
> +  int c[16];
> +};
> +
> +extern void foo (int, int, int);
> +
> +void
> +f (struct x *p, unsigned int n)
> +{
> +  foo (*((int *)p + n*4), *((int *)p + 32 + n*4), *((int *)p + 16 + n*4));
> +  if (n > 3)
> +    {
> +      foo (*((int *)p + n*4), *((int *)p + 32 + n*4), *((int *)p + 16 + n*4));
> +      if (n > 12)
> +       foo (*((int *)p + 16 + n*4), *((int *)p + n*4), *((int *)p + 32 + n*4));
> +    }
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "\\(mem:SI \\(plus:" 6 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> +/* { dg-final { cleanup-rtl-dump "expand" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c   (revision 0)
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-expand" } */
> +/* { dg-skip-if "" { lp64 } { "*" } { "-m32" } } */
> +/* { dg-skip-if "" { llp64 } { "*" } { "-m32" } } */
> +
> +struct x
> +{
> +  int a[16];
> +  int b[16];
> +  int c[16];
> +};
> +
> +extern void foo (int, int, int);
> +
> +void
> +f (struct x *p, unsigned int n)
> +{
> +  foo (*((int *)p + n * 4),
> +       *((int *)p + (n + 8) * 4),
> +       *((int *)p + (n + 4) * 4));
> +  if (n > 3)
> +    {
> +      foo (*((int *)p + n * 4),
> +          *((int *)p + (n + 8) * 4),
> +          *((int *)p + (n + 4) * 4));
> +      if (n > 12)
> +       foo (*((int *)p + (n + 4) * 4),
> +            *((int *)p + n * 4),
> +            *((int *)p + (n + 8) * 4));
> +    }
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "\\(mem:SI \\(plus:" 6 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
> +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
> +/* { dg-final { cleanup-rtl-dump "expand" } } */
> Index: gcc/expr.c
> ===================================================================
> --- gcc/expr.c  (revision 180138)
> +++ gcc/expr.c  (working copy)
> @@ -56,6 +56,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "ssaexpand.h"
>  #include "target-globals.h"
>  #include "params.h"
> +#include "tree-pretty-print.h"
>
>  /* Decide whether a function's arguments should be processed
>    from first to last or from last to first.
> @@ -7648,7 +7649,157 @@ expand_constructor (tree exp, rtx target, enum exp
>   return target;
>  }
>
> +/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether
> +   there is a profitable opportunity to restructure address arithmetic
> +   within BASE and OFFSET.  If so, produce such a restructuring and
> +   return it.  */
> +/* TODO: This belongs more properly in a separate pass that performs
> +   general strength reduction on straight-line code.  Eventually move
> +   this there.  */
>
> +static tree
> +restructure_base_and_offset (tree expr, tree base, tree offset,
> +                            HOST_WIDE_INT bitpos)
> +{
> +  tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref;
> +  double_int c, c1, c2, c3, c4;
> +
> +  /* Look for the following pattern:
> +
> +       base   = MEM_REF (T1, C1);
> +       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
> +       bitpos = C4 * BITS_PER_UNIT
> +
> +     If found, convert it to:
> +
> +       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
> +                C1 + (C2 * C3) + C4)
> +   */
> +  if (!base
> +      || !offset
> +      || TREE_CODE (base) != MEM_REF
> +      || TREE_CODE (offset) != MULT_EXPR)
> +    return NULL_TREE;
> +
> +  mult_op0 = TREE_OPERAND (offset, 0);
> +  mult_op1 = TREE_OPERAND (offset, 1);
> +
> +  if (TREE_CODE (mult_op0) != PLUS_EXPR
> +      || TREE_CODE (mult_op1) != INTEGER_CST
> +      || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST)
> +    return NULL_TREE;
> +
> +  t1 = TREE_OPERAND (base, 0);
> +  t2 = TREE_OPERAND (mult_op0, 0);
> +  c1 = mem_ref_offset (base);
> +  c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1));
> +  c3 = tree_to_double_int (mult_op1);
> +  c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT);
> +  c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4);
> +
> +  offset_type = TREE_TYPE (TREE_OPERAND (base, 1));
> +
> +  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2,
> +                          double_int_to_tree (sizetype, c3));
> +
> +  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
> +
> +  mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr,
> +                        double_int_to_tree (offset_type, c));
> +  return mem_ref;
> +}
> +
> +/* Look for a hidden array reference in memory reference EXP.  If found,
> +   return a revised memory reference that restructures addressability
> +   to combine constants.  */
> +/* TODO: This belongs more properly in a separate pass that performs
> +   general strength reduction on straight-line code.  Eventually move
> +   this there.  */
> +
> +static tree
> +restructure_mem_ref (tree exp)
> +{
> +  tree s2, s3, t1, t2, offset_type, mult_expr, add_expr;
> +  tree s2_op0, s2_op1, s3_op0, s3_op1, result;
> +  gimple s1_stmt, s2_stmt, s3_stmt;
> +  double_int c1, c2, c3, c;
> +
> +  /* Currently we look for the following pattern, where each Si is an
> +     SSA name, each Tj is an arbitrary tree, and each Ck is an integer
> +     constant:
> +
> +       S3 = T2 + C2;
> +       S2 = S3 * C3;
> +       S1 = T1 ptr+ S2;
> +       exp = MEM_REF (S1, C1);
> +
> +     This is rewritten as:
> +
> +       exp = MEM_REF (T1 ptr+ (T2 * C3)), C1 + C2 * C3);
> +
> +  */
> +  if (TREE_CODE (TREE_OPERAND (exp, 0)) != SSA_NAME)
> +    return NULL_TREE;
> +
> +  /* We don't use get_def_for_expr for S1 because TER doesn't forward
> +     S1 in some situations where this transform is useful, such as
> +     when S1 is the base of two MEM_REFs fitting the pattern.  */
> +  s1_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (exp, 0));

You can't do this - this will possibly generate wrong code.  You _do_
have to use get_def_for_expr.  Or do it when we are still in "true" SSA form...

Richard.

> +  if (!s1_stmt
> +      || !is_gimple_assign (s1_stmt)
> +      || gimple_assign_rhs_code (s1_stmt) != POINTER_PLUS_EXPR)
> +    return NULL_TREE;
> +
> +  c1 = mem_ref_offset (exp);
> +  t1 = gimple_assign_rhs1 (s1_stmt);
> +  s2 = gimple_assign_rhs2 (s1_stmt);
> +
> +  if (TREE_CODE (s2) != SSA_NAME)
> +    return NULL_TREE;
> +
> +  s2_stmt = get_def_for_expr (s2, MULT_EXPR);
> +
> +  if (!s2_stmt)
> +    return NULL_TREE;
> +
> +  s2_op0 = gimple_assign_rhs1 (s2_stmt);
> +  s2_op1 = gimple_assign_rhs2 (s2_stmt);
> +
> +  if (TREE_CODE (s2_op1) != INTEGER_CST)
> +    return NULL_TREE;
> +
> +  s3 = s2_op0;
> +  c3 = tree_to_double_int (s2_op1);
> +  s3_stmt = get_def_for_expr (s3, PLUS_EXPR);
> +
> +  if (!s3_stmt)
> +    return NULL_TREE;
> +
> +  s3_op0 = gimple_assign_rhs1 (s3_stmt);
> +  s3_op1 = gimple_assign_rhs2 (s3_stmt);
> +
> +  if (TREE_CODE (s3_op1) != INTEGER_CST)
> +    return NULL_TREE;
> +
> +  t2 = s3_op0;
> +  c2 = tree_to_double_int (s3_op1);
> +  c = double_int_add (c1, double_int_mul (c2, c3));
> +
> +  offset_type = TREE_TYPE (TREE_OPERAND (exp, 1));
> +
> +  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2,
> +                          double_int_to_tree (sizetype, c3));
> +
> +  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
> +
> +  result = build2 (MEM_REF, TREE_TYPE (exp), add_expr,
> +                  double_int_to_tree (offset_type, c));
> +
> +  return result;
> +}
> +
> +
>  /* expand_expr: generate code for computing expression EXP.
>    An rtx for the computed value is returned.  The value is never null.
>    In the case of a void EXP, const0_rtx is returned.
> @@ -9313,6 +9464,28 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
>        gimple def_stmt;
>        enum insn_code icode;
>        unsigned align;
> +       tree tem;
> +
> +       /* Attempt to restructure a hidden array reference, such
> +          as *(p + (n + k) * s).  This generates dead code, so
> +          require -O2.  */
> +       if (optimize > 1
> +           && mode != BLKmode
> +           && ((tem = restructure_mem_ref (exp)) != NULL_TREE))
> +         {
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             {
> +               fprintf (dump_file, "\nRestructured memory reference:\n");
> +               fprintf (dump_file, "  Original reference: ");
> +               print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS);
> +               fprintf (dump_file, "\n  Replacement: ");
> +               print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS);
> +             }
> +           copy_ref_info (tem, exp);
> +           exp = tem;
> +           base = TREE_OPERAND (exp, 0);
> +         }
> +
>        /* Handle expansion of non-aliased memory with non-BLKmode.  That
>           might end up in a register.  */
>        if (TREE_CODE (base) == ADDR_EXPR)
> @@ -9584,7 +9757,7 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
>       {
>        enum machine_mode mode1, mode2;
>        HOST_WIDE_INT bitsize, bitpos;
> -       tree offset;
> +       tree offset, tem2;
>        int volatilep = 0, must_force_mem;
>        bool packedp = false;
>        tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset,
> @@ -9601,6 +9774,36 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
>                && DECL_PACKED (TREE_OPERAND (exp, 1))))
>          packedp = true;
>
> +       /* Attempt to restructure the base and offset to combine constants.
> +          Bitfield references should eventually be lowered prior to this
> +          point and are not dealt with.  Skip BLKmode aggregates as well.
> +          This generates dead code, so require -O2.  */
> +       if (optimize > 1
> +           && code != BIT_FIELD_REF
> +           && (code != COMPONENT_REF
> +               || !DECL_BIT_FIELD (TREE_OPERAND (exp, 1)))
> +           && TYPE_MODE (TREE_TYPE (exp)) != BLKmode
> +           && bitpos % BITS_PER_UNIT == 0
> +           && ((tem2 = restructure_base_and_offset (exp, tem, offset, bitpos))
> +               != NULL_TREE))
> +         {
> +           copy_ref_info (tem2, exp);
> +           tem = tem2;
> +           /* The offset and bitpos were absorbed into the new MEM_REF, so
> +              make sure we don't add them in again.  */
> +           offset = NULL_TREE;
> +           bitpos = 0;
> +
> +           if (dump_file && (dump_flags & TDF_DETAILS))
> +             {
> +               fprintf (dump_file, "\nRestructured inner reference:\n");
> +               fprintf (dump_file, "  Original reference: ");
> +               print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS);
> +               fprintf (dump_file, "\n  Replacement: ");
> +               print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS);
> +             }
> +         }
> +
>        /* If TEM's type is a union of variable size, pass TARGET to the inner
>           computation, since it will need a temporary and TARGET is known
>           to have to do.  This occurs in unchecked conversion in Ada.  */
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in     (revision 180138)
> +++ gcc/Makefile.in     (working copy)
> @@ -2926,7 +2926,7 @@ expr.o : expr.c $(CONFIG_H) $(SYSTEM_H) coretypes.
>    reload.h langhooks.h intl.h $(TM_P_H) $(TARGET_H) \
>    tree-iterator.h gt-expr.h $(MACHMODE_H) $(TIMEVAR_H) $(TREE_FLOW_H) \
>    $(TREE_PASS_H) $(DF_H) $(DIAGNOSTIC_H) vecprim.h $(SSAEXPAND_H) \
> -   $(PARAMS_H) $(COMMON_TARGET_H)
> +   $(PARAMS_H) $(COMMON_TARGET_H) tree-pretty-print.h
>  dojump.o : dojump.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) \
>    $(FLAGS_H) $(FUNCTION_H) $(EXPR_H) $(OPTABS_H) $(INSN_ATTR_H) insn-config.h \
>    langhooks.h $(GGC_H) gt-dojump.h vecprim.h $(BASIC_BLOCK_H) output.h
>
>
>
Bill Schmidt Oct. 21, 2011, 12:22 p.m. UTC | #2
On Fri, 2011-10-21 at 11:26 +0200, Richard Guenther wrote:
> On Tue, Oct 18, 2011 at 4:14 PM, William J. Schmidt
> <wschmidt@linux.vnet.ibm.com> wrote:

<snip>

> > +
> > +  /* We don't use get_def_for_expr for S1 because TER doesn't forward
> > +     S1 in some situations where this transform is useful, such as
> > +     when S1 is the base of two MEM_REFs fitting the pattern.  */
> > +  s1_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (exp, 0));
> 
> You can't do this - this will possibly generate wrong code.  You _do_
> have to use get_def_for_expr.  Or do it when we are still in "true" SSA form...
> 
> Richard.
> 

OK.  get_def_for_expr always returns NULL here for the cases I was
targeting, so doing this in expand isn't going to be helpful.

Rather than cram this in somewhere else upstream, it might be better to
just wait and let this case be handled by the new strength reduction
pass.  This is one of the easy cases with explicit multiplies in the
instruction stream, so it shouldn't require any special handling there.
Seem reasonable?

Bill
Richard Biener Oct. 23, 2011, 9:31 a.m. UTC | #3
On Fri, Oct 21, 2011 at 2:22 PM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
>
>
> On Fri, 2011-10-21 at 11:26 +0200, Richard Guenther wrote:
>> On Tue, Oct 18, 2011 at 4:14 PM, William J. Schmidt
>> <wschmidt@linux.vnet.ibm.com> wrote:
>
> <snip>
>
>> > +
>> > +  /* We don't use get_def_for_expr for S1 because TER doesn't forward
>> > +     S1 in some situations where this transform is useful, such as
>> > +     when S1 is the base of two MEM_REFs fitting the pattern.  */
>> > +  s1_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (exp, 0));
>>
>> You can't do this - this will possibly generate wrong code.  You _do_
>> have to use get_def_for_expr.  Or do it when we are still in "true" SSA form...
>>
>> Richard.
>>
>
> OK.  get_def_for_expr always returns NULL here for the cases I was
> targeting, so doing this in expand isn't going to be helpful.
>
> Rather than cram this in somewhere else upstream, it might be better to
> just wait and let this case be handled by the new strength reduction
> pass.  This is one of the easy cases with explicit multiplies in the
> instruction stream, so it shouldn't require any special handling there.
> Seem reasonable?

Yes, sure.

Richard.

> Bill
>
>
diff mbox

Patch

Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c	(revision 0)
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c	(revision 0)
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+  if (n > 12)
+    foo (p->a[n], p->c[n], p->b[n]);
+  else if (n > 3)
+    foo (p->b[n], p->a[n], p->c[n]);
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c	(revision 0)
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+  if (n > 3)
+    {
+      foo (p->a[n], p->c[n], p->b[n]);
+      if (n > 12)
+	foo (p->b[n], p->a[n], p->c[n]);
+    }
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c	(revision 0)
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (*((int *)p + n*4), *((int *)p + 32 + n*4), *((int *)p + 16 + n*4));
+  if (n > 3)
+    {
+      foo (*((int *)p + n*4), *((int *)p + 32 + n*4), *((int *)p + 16 + n*4));
+      if (n > 12)
+	foo (*((int *)p + 16 + n*4), *((int *)p + n*4), *((int *)p + 32 + n*4));
+    }
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c	(revision 0)
@@ -0,0 +1,36 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+/* { dg-skip-if "" { lp64 } { "*" } { "-m32" } } */
+/* { dg-skip-if "" { llp64 } { "*" } { "-m32" } } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (*((int *)p + n * 4),
+       *((int *)p + (n + 8) * 4),
+       *((int *)p + (n + 4) * 4));
+  if (n > 3)
+    {
+      foo (*((int *)p + n * 4),
+	   *((int *)p + (n + 8) * 4),
+	   *((int *)p + (n + 4) * 4));
+      if (n > 12)
+	foo (*((int *)p + (n + 4) * 4),
+	     *((int *)p + n * 4),
+	     *((int *)p + (n + 8) * 4));
+    }
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/expr.c
===================================================================
--- gcc/expr.c	(revision 180138)
+++ gcc/expr.c	(working copy)
@@ -56,6 +56,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "ssaexpand.h"
 #include "target-globals.h"
 #include "params.h"
+#include "tree-pretty-print.h"
 
 /* Decide whether a function's arguments should be processed
    from first to last or from last to first.
@@ -7648,7 +7649,157 @@  expand_constructor (tree exp, rtx target, enum exp
   return target;
 }
 
+/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether
+   there is a profitable opportunity to restructure address arithmetic
+   within BASE and OFFSET.  If so, produce such a restructuring and
+   return it.  */
+/* TODO: This belongs more properly in a separate pass that performs
+   general strength reduction on straight-line code.  Eventually move
+   this there.  */
 
+static tree
+restructure_base_and_offset (tree expr, tree base, tree offset,
+			     HOST_WIDE_INT bitpos)
+{
+  tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref;
+  double_int c, c1, c2, c3, c4;
+
+  /* Look for the following pattern:
+
+       base   = MEM_REF (T1, C1);
+       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
+       bitpos = C4 * BITS_PER_UNIT
+
+     If found, convert it to:
+
+       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
+                C1 + (C2 * C3) + C4)
+   */
+  if (!base
+      || !offset
+      || TREE_CODE (base) != MEM_REF
+      || TREE_CODE (offset) != MULT_EXPR)
+    return NULL_TREE;
+
+  mult_op0 = TREE_OPERAND (offset, 0);
+  mult_op1 = TREE_OPERAND (offset, 1);
+
+  if (TREE_CODE (mult_op0) != PLUS_EXPR
+      || TREE_CODE (mult_op1) != INTEGER_CST
+      || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST)
+    return NULL_TREE;
+
+  t1 = TREE_OPERAND (base, 0);
+  t2 = TREE_OPERAND (mult_op0, 0);
+  c1 = mem_ref_offset (base);
+  c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1));
+  c3 = tree_to_double_int (mult_op1);
+  c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT);
+  c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4);
+
+  offset_type = TREE_TYPE (TREE_OPERAND (base, 1));
+
+  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, 
+			   double_int_to_tree (sizetype, c3));
+
+  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
+
+  mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr,
+			 double_int_to_tree (offset_type, c));
+  return mem_ref;
+}
+
+/* Look for a hidden array reference in memory reference EXP.  If found,
+   return a revised memory reference that restructures addressability
+   to combine constants.  */
+/* TODO: This belongs more properly in a separate pass that performs
+   general strength reduction on straight-line code.  Eventually move
+   this there.  */
+
+static tree
+restructure_mem_ref (tree exp)
+{
+  tree s2, s3, t1, t2, offset_type, mult_expr, add_expr;
+  tree s2_op0, s2_op1, s3_op0, s3_op1, result;
+  gimple s1_stmt, s2_stmt, s3_stmt;
+  double_int c1, c2, c3, c;
+
+  /* Currently we look for the following pattern, where each Si is an
+     SSA name, each Tj is an arbitrary tree, and each Ck is an integer
+     constant:
+
+       S3 = T2 + C2;
+       S2 = S3 * C3;
+       S1 = T1 ptr+ S2;
+       exp = MEM_REF (S1, C1);
+
+     This is rewritten as:
+
+       exp = MEM_REF (T1 ptr+ (T2 * C3)), C1 + C2 * C3);
+
+  */
+  if (TREE_CODE (TREE_OPERAND (exp, 0)) != SSA_NAME)
+    return NULL_TREE;
+
+  /* We don't use get_def_for_expr for S1 because TER doesn't forward
+     S1 in some situations where this transform is useful, such as
+     when S1 is the base of two MEM_REFs fitting the pattern.  */
+  s1_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (exp, 0));
+  
+  if (!s1_stmt
+      || !is_gimple_assign (s1_stmt)
+      || gimple_assign_rhs_code (s1_stmt) != POINTER_PLUS_EXPR)
+    return NULL_TREE;
+
+  c1 = mem_ref_offset (exp);
+  t1 = gimple_assign_rhs1 (s1_stmt);
+  s2 = gimple_assign_rhs2 (s1_stmt);
+
+  if (TREE_CODE (s2) != SSA_NAME)
+    return NULL_TREE;
+
+  s2_stmt = get_def_for_expr (s2, MULT_EXPR);
+
+  if (!s2_stmt)
+    return NULL_TREE;
+
+  s2_op0 = gimple_assign_rhs1 (s2_stmt);
+  s2_op1 = gimple_assign_rhs2 (s2_stmt);
+
+  if (TREE_CODE (s2_op1) != INTEGER_CST)
+    return NULL_TREE;
+
+  s3 = s2_op0;
+  c3 = tree_to_double_int (s2_op1);
+  s3_stmt = get_def_for_expr (s3, PLUS_EXPR);
+
+  if (!s3_stmt)
+    return NULL_TREE;
+
+  s3_op0 = gimple_assign_rhs1 (s3_stmt);
+  s3_op1 = gimple_assign_rhs2 (s3_stmt);
+
+  if (TREE_CODE (s3_op1) != INTEGER_CST)
+    return NULL_TREE;
+
+  t2 = s3_op0;
+  c2 = tree_to_double_int (s3_op1);
+  c = double_int_add (c1, double_int_mul (c2, c3));
+
+  offset_type = TREE_TYPE (TREE_OPERAND (exp, 1));
+
+  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2,
+			   double_int_to_tree (sizetype, c3));
+
+  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
+
+  result = build2 (MEM_REF, TREE_TYPE (exp), add_expr,
+		   double_int_to_tree (offset_type, c));
+
+  return result;
+}
+
+
 /* expand_expr: generate code for computing expression EXP.
    An rtx for the computed value is returned.  The value is never null.
    In the case of a void EXP, const0_rtx is returned.
@@ -9313,6 +9464,28 @@  expand_expr_real_1 (tree exp, rtx target, enum mac
 	gimple def_stmt;
 	enum insn_code icode;
 	unsigned align;
+	tree tem;
+
+	/* Attempt to restructure a hidden array reference, such
+	   as *(p + (n + k) * s).  This generates dead code, so
+	   require -O2.  */
+	if (optimize > 1
+	    && mode != BLKmode
+	    && ((tem = restructure_mem_ref (exp)) != NULL_TREE))
+	  {
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      {
+		fprintf (dump_file, "\nRestructured memory reference:\n");
+		fprintf (dump_file, "  Original reference: ");
+		print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS);
+		fprintf (dump_file, "\n  Replacement: ");
+		print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS);
+	      }
+	    copy_ref_info (tem, exp);
+	    exp = tem;
+	    base = TREE_OPERAND (exp, 0);
+	  }
+
 	/* Handle expansion of non-aliased memory with non-BLKmode.  That
 	   might end up in a register.  */
 	if (TREE_CODE (base) == ADDR_EXPR)
@@ -9584,7 +9757,7 @@  expand_expr_real_1 (tree exp, rtx target, enum mac
       {
 	enum machine_mode mode1, mode2;
 	HOST_WIDE_INT bitsize, bitpos;
-	tree offset;
+	tree offset, tem2;
 	int volatilep = 0, must_force_mem;
 	bool packedp = false;
 	tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset,
@@ -9601,6 +9774,36 @@  expand_expr_real_1 (tree exp, rtx target, enum mac
 		&& DECL_PACKED (TREE_OPERAND (exp, 1))))
 	  packedp = true;
 
+	/* Attempt to restructure the base and offset to combine constants.
+	   Bitfield references should eventually be lowered prior to this
+	   point and are not dealt with.  Skip BLKmode aggregates as well.
+	   This generates dead code, so require -O2.  */
+	if (optimize > 1
+	    && code != BIT_FIELD_REF
+	    && (code != COMPONENT_REF 
+		|| !DECL_BIT_FIELD (TREE_OPERAND (exp, 1)))
+	    && TYPE_MODE (TREE_TYPE (exp)) != BLKmode
+	    && bitpos % BITS_PER_UNIT == 0
+	    && ((tem2 = restructure_base_and_offset (exp, tem, offset, bitpos))
+		!= NULL_TREE))
+	  {
+	    copy_ref_info (tem2, exp);
+	    tem = tem2;
+	    /* The offset and bitpos were absorbed into the new MEM_REF, so 
+	       make sure we don't add them in again.  */
+	    offset = NULL_TREE;
+	    bitpos = 0;
+
+	    if (dump_file && (dump_flags & TDF_DETAILS))
+	      {
+		fprintf (dump_file, "\nRestructured inner reference:\n");
+		fprintf (dump_file, "  Original reference: ");
+		print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS);
+		fprintf (dump_file, "\n  Replacement: ");
+		print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS);
+	      }
+	  }
+
 	/* If TEM's type is a union of variable size, pass TARGET to the inner
 	   computation, since it will need a temporary and TARGET is known
 	   to have to do.  This occurs in unchecked conversion in Ada.  */
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in	(revision 180138)
+++ gcc/Makefile.in	(working copy)
@@ -2926,7 +2926,7 @@  expr.o : expr.c $(CONFIG_H) $(SYSTEM_H) coretypes.
    reload.h langhooks.h intl.h $(TM_P_H) $(TARGET_H) \
    tree-iterator.h gt-expr.h $(MACHMODE_H) $(TIMEVAR_H) $(TREE_FLOW_H) \
    $(TREE_PASS_H) $(DF_H) $(DIAGNOSTIC_H) vecprim.h $(SSAEXPAND_H) \
-   $(PARAMS_H) $(COMMON_TARGET_H)
+   $(PARAMS_H) $(COMMON_TARGET_H) tree-pretty-print.h
 dojump.o : dojump.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) \
    $(FLAGS_H) $(FUNCTION_H) $(EXPR_H) $(OPTABS_H) $(INSN_ATTR_H) insn-config.h \
    langhooks.h $(GGC_H) gt-dojump.h vecprim.h $(BASIC_BLOCK_H) output.h