diff mbox

Builtins handling in IVOPT

Message ID CA+4CFy7_1JO7r3eq8TWzotR1L6oGKkecZx24OvBd8yeXKcshWQ@mail.gmail.com
State New
Headers show

Commit Message

Wei Mi Nov. 21, 2013, 7:26 a.m. UTC
Hi,

This patch works on the intrinsic calls handling issue in IVOPT mentioned here:
http://gcc.gnu.org/ml/gcc-patches/2010-10/msg01295.html

In find_interesting_uses_stmt, it changes

arg = expr
__builtin_xxx (arg)

to

arg = expr;
tmp = addr_expr (mem_ref(arg));
__builtin_xxx (tmp, ...)

So mem_ref(arg) could be handled by find_interesting_uses_address, and
an iv use of USE_ADDRESS type will be generated for expr, then a TMR
will be generated for expr in rewrite_use_address. The expand pass is
changed accordingly so that the complex addressing mode is not
split for CSE.

With the patch we can handle the motivational case below.

#include <x86intrin.h>
extern __m128i arr[], d[];
void test (void)
{
    unsigned int b;
    for (b = 0; b < 1000; b += 2) {
      __m128i *p = (__m128i *)(&d[b]);
      __m128i a = _mm_load_si128(&arr[4*b+3]);
      __m128i v = _mm_loadu_si128(p);
      v = _mm_xor_si128(v, a);
      _mm_storeu_si128(p, v);
    }
}

gcc-r204792 Without the patch:
.L2:
        movdqu  (%rax), %xmm0
        subq    $-128, %rdx
        addq    $32, %rax
        pxor    -128(%rdx), %xmm0
        movups  %xmm0, -32(%rax)
        cmpq    $arr+64048, %rdx
        jne     .L2

gcc-r204792 With the patch:
.L2:
        movdqu  d(%rax), %xmm0
        pxor    arr+48(,%rax,4), %xmm0
        addq    $32, %rax
        movups  %xmm0, d-32(%rax)
        cmpq    $16000, %rax
        jne     .L2

Following things to be addressed later:
1. TER needs to be extended to handle the case when TMR is csed.

2. For more complex cases to work, besides this patch, we also need to
tune AVG_LOOP_NITER, which is now set to 5 and limits induction
variable merging a lot. Increasing the parameter to a larger value
could remove some induction variables in critical loops in some of
our benchmarks. Register pressure estimation may also need to be
tuned. I will address this in a separate patch.

3. Now the information about which param of a builtin is of memory
reference type is simply listed as a switch-case in
builtin_has_mem_ref_p and ix86_builtin_has_mem_ref_p. This is not
ideal but there is no infrastructure to describe it in existing
implementation. More detailed information such as parameter and call
side effects will be important for more precise alias analysis and
may be worth some work. Maybe the refinement of this patch could be
done after that.

Regression and bootstrap tests pass on x86_64-linux-gnu. OK for trunk?

Thanks,
Wei.

2013-11-20  Wei Mi  <wmi@google.com>

        * expr.c (expand_expr_addr_expr_1): Not to split TMR.
        (expand_expr_real_1): Ditto.
        * targhooks.c (default_builtin_has_mem_ref_p): Default
        builtin.
        * tree-ssa-loop-ivopts.c (struct iv): Add field builtin_mem_param.
        (alloc_iv): Ditto.
        (remove_unused_ivs): Ditto.
        (builtin_has_mem_ref_p): New function.
        (find_interesting_uses_stmt): Special handling of builtins.
        * gimple-expr.c (is_gimple_address): Add handling of TMR.
        * gimple-expr.h (is_gimple_addressable): Ditto.
        * config/i386/i386.c (ix86_builtin_has_mem_ref_p): New target hook.
        (ix86_atomic_assign_expand_fenv): Ditto.
        (ix86_expand_special_args_builtin): Special handling of TMR for
        builtin.
        * target.def (builtin_has_mem_ref_p): New hook.
        * doc/tm.texi.in: Ditto.
        * doc/tm.texi: Generated.

2013-11-20  Wei Mi  <wmi@google.com>

        * gcc.dg/tree-ssa/ivopt_5.c: New test.

Comments

Zdenek Dvorak Nov. 21, 2013, 8:19 a.m. UTC | #1
Hi,

> This patch works on the intrinsic calls handling issue in IVOPT mentioned here:
> http://gcc.gnu.org/ml/gcc-patches/2010-10/msg01295.html
> 
> In find_interesting_uses_stmt, it changes
> 
> arg = expr
> __builtin_xxx (arg)
> 
> to
> 
> arg = expr;
> tmp = addr_expr (mem_ref(arg));
> __builtin_xxx (tmp, ...)

this looks a bit confusing (and wasteful) to me. It would make more sense to
just record the argument as USE_ADDRESS and do the rewriting in rewrite_use_address.

Zdenek
Bin.Cheng Nov. 21, 2013, 8:48 a.m. UTC | #2
I don't know very much about the problem but willing to study since I
am looking into IVO recently :)

> --- tree-ssa-loop-ivopts.c      (revision 204792)
> +++ tree-ssa-loop-ivopts.c      (working copy)
> @@ -135,6 +135,8 @@ struct iv
>    tree ssa_name;       /* The ssa name with the value.  */
>    bool biv_p;          /* Is it a biv?  */
>    bool have_use_for;   /* Do we already have a use for it?  */
> +  bool builtin_mem_param; /* Used as param of a builtin, so it could not be
> +                            removed by remove_unused_ivs.  */

As comment below, address parameter may be not limited to builtin
function only, how about a variable name more generic?

>    unsigned use_id;     /* The identifier in the use if it is the case.  */
>  };
>
> @@ -952,6 +954,7 @@ alloc_iv (tree base, tree step)
>    iv->step = step;
>    iv->biv_p = false;
>    iv->have_use_for = false;
> +  iv->builtin_mem_param = false;
>    iv->use_id = 0;
>    iv->ssa_name = NULL_TREE;
>
> @@ -1874,13 +1877,36 @@ find_invariants_stmt (struct ivopts_data
>      }
>  }
>
> +/* Find whether the Ith param of the BUILTIN is a mem
> +   reference. If I is -1, it returns whether the BUILTIN
> +   contains any mem reference type param.  */
> +
> +static bool
> +builtin_has_mem_ref_p (gimple builtin, int i)
> +{
> +  tree fndecl = gimple_call_fndecl (builtin);
> +  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
> +    {
> +      switch (DECL_FUNCTION_CODE (fndecl))
> +       {
> +       case BUILT_IN_PREFETCH:
> +         if (i == -1 || i == 0)
> +            return true;
> +       }
This switch looks strange, could be refactored I think.

> +    }
> +  else if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
> +    return targetm.builtin_has_mem_ref_p ((int) DECL_FUNCTION_CODE
> (fndecl), i);
> +
> +  return false;
> +}
> +
>  /* Finds interesting uses of induction variables in the statement STMT.  */
>
>  static void
>  find_interesting_uses_stmt (struct ivopts_data *data, gimple stmt)
>  {
>    struct iv *iv;
> -  tree op, *lhs, *rhs;
> +  tree op, *lhs, *rhs, callee;
>    ssa_op_iter iter;
>    use_operand_p use_p;
>    enum tree_code code;
> @@ -1937,6 +1963,74 @@ find_interesting_uses_stmt (struct ivopt
>
>          call (memory).  */
>      }
> +  else if (is_gimple_call (stmt)
> +          && (callee = gimple_call_fndecl (stmt))
> +          && is_builtin_fn (callee)
> +          && builtin_has_mem_ref_p (stmt, -1))
> +    {

I noticed the preceding comments about call(memory), is your change a
specific case of the mention one?

> +      size_t i;
> +      for (i = 0; i < gimple_call_num_args (stmt); i++)
> +       {
> +         if (builtin_has_mem_ref_p (stmt, i))
> +           {
> +             gimple def, g;
> +             gimple_seq seq = NULL;
> +             tree type, mem, addr, rhs;
> +             tree *arg = gimple_call_arg_ptr (stmt, i);
> +              location_t loc = gimple_location (stmt);
> +             gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
> +
> +              if (TREE_CODE (*arg) != SSA_NAME)
> +               continue;
> +
> +             def = SSA_NAME_DEF_STMT (*arg);
> +             gcc_assert (gimple_code (def) == GIMPLE_PHI
> +                         || is_gimple_assign (def));
> +             /* Suppose we have the case:
> +                  arg = expr;
> +                  call (arg)
> +                If the expr is not like the form: &MEM(...), change it to:
> +                  arg = expr;
> +                  tmp = &MEM(arg);
> +                  call(tmp);
> +                then try to find interesting uses address in MEM(arg).  */
> +             if (is_gimple_assign (def)
> +                 && (rhs = gimple_assign_rhs1(def))
> +                 && TREE_CODE (rhs) == ADDR_EXPR
> +                 && REFERENCE_CLASS_P (TREE_OPERAND (rhs, 0)))
> +               {
> +                 iv = get_iv (data, *arg);
> +                 if (iv && !iv->builtin_mem_param)
> +                   iv->builtin_mem_param = true;
> +
> +                 find_interesting_uses_address (data, def,
> +                                                &TREE_OPERAND (rhs, 0));
> +               }
> +             else
> +               {
> +                 mem = build2 (MEM_REF, TREE_TYPE (*arg), *arg,
> +                               build_int_cst (TREE_TYPE (*arg), 0));
> +                 type = build_pointer_type (TREE_TYPE (*arg));
> +                 addr = build1 (ADDR_EXPR, type, mem);
> +                 g = gimple_build_assign_with_ops (ADDR_EXPR,
> +                                                   make_ssa_name (type, NULL),
> +                                                   addr, NULL);
> +                 gimple_call_set_arg (stmt, i, gimple_assign_lhs (g));
> +                 update_stmt (stmt);
> +                 gimple_set_location (g, loc);
> +                 gimple_seq_add_stmt_without_update (&seq, g);
> +                 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
> +                 find_interesting_uses_address (data, g,
> &TREE_OPERAND (addr, 0));

This would be the only code changes gimple before iv use rewriting and
seems not a good practice.

> +               }
> +           }
> +         else
> +           {
> +             tree arg = gimple_call_arg (stmt, i);
> +             find_interesting_uses_op (data, arg);
> +           }
> +       }
> +      return;
> +    }
>
>    if (gimple_code (stmt) == GIMPLE_PHI
>        && gimple_bb (stmt) == data->current_loop->header)
> @@ -6507,10 +6601,10 @@ remove_unused_ivs (struct ivopts_data *d
>           && !integer_zerop (info->iv->step)
>           && !info->inv_id
>           && !info->iv->have_use_for
> +         && !info->iv->builtin_mem_param
>           && !info->preserve_biv)
>         {

Thanks,
bin
Richard Biener Nov. 21, 2013, 1:33 p.m. UTC | #3
On Thu, Nov 21, 2013 at 8:26 AM, Wei Mi <wmi@google.com> wrote:
> Hi,
>
> This patch works on the intrinsic calls handling issue in IVOPT mentioned here:
> http://gcc.gnu.org/ml/gcc-patches/2010-10/msg01295.html
>
> In find_interesting_uses_stmt, it changes
>
> arg = expr
> __builtin_xxx (arg)
>
> to
>
> arg = expr;
> tmp = addr_expr (mem_ref(arg));
> __builtin_xxx (tmp, ...)
>
> So mem_ref(arg) could be handled by find_interesting_uses_address, and
> an iv use of USE_ADDRESS type will be generated for expr, then a TMR
> will be generated for expr in rewrite_use_address. Expand pass is
> changed accordingly to keep the complex addressing mode not to be
> splitted for cse.
>
> With the patch we can handle the motivational case below.
>
> #include <x86intrin.h>
> extern __m128i arr[], d[];
> void test (void)
> {
>     unsigned int b;
>     for (b = 0; b < 1000; b += 2) {
>       __m128i *p = (__m128i *)(&d[b]);
>       __m128i a = _mm_load_si128(&arr[4*b+3]);
>       __m128i v = _mm_loadu_si128(p);
>       v = _mm_xor_si128(v, a);
>       _mm_storeu_si128(p, v);
>     }
> }
>
> gcc-r204792 Without the patch:
> .L2:
>         movdqu  (%rax), %xmm0
>         subq    $-128, %rdx
>         addq    $32, %rax
>         pxor    -128(%rdx), %xmm0
>         movups  %xmm0, -32(%rax)
>         cmpq    $arr+64048, %rdx
>         jne     .L2
>
> gcc-r204792 With the patch:
> .L2:
>         movdqu  d(%rax), %xmm0
>         pxor    arr+48(,%rax,4), %xmm0
>         addq    $32, %rax
>         movups  %xmm0, d-32(%rax)
>         cmpq    $16000, %rax
>         jne     .L2
>
> Following things to be addressed later:
> 1. TER needs to be extended to handle the case when TMR is csed.
>
> 2. For more complex cases to work, besides this patch, we also need to
> tune the AVG_LOOP_NITER, which is now set to 5, and it limits
> induction variables merging a lot. Increasing the parameter to a
> larger one could remove some induction variable in critical loop in
> some our benchmarks. reg pressure estimation may also need to be
> tuned. I will address it in a separate patch.
>
> 3. Now the information about which param of a builtin is of memory
> reference type is simply listed as a switch-case in
> builtin_has_mem_ref_p and ix86_builtin_has_mem_ref_p. This is not
> ideal but there is no infrastructure to describe it in existing
> implementation. More detailed information such as parameter and call
> side-effect will be important for more precise alias and may worth
> some work. Maybe the refinement about this patch could be done after
> that.
>
> regression and bootstrap pass on x86_64-linux-gnu. ok for trunk?

So what you are doing is basically not only rewriting memory references
to possibly use TARGET_MEM_REF but also address uses to use
&TARGET_MEM_REF.  I think this is a good thing in general
(given instructions like x86 lea) and I would not bother distinguishing
the different kind of uses.

Richard.

> Thanks,
> Wei.
>
> 2013-11-20  Wei Mi  <wmi@google.com>
>
>         * expr.c (expand_expr_addr_expr_1): Not to split TMR.
>         (expand_expr_real_1): Ditto.
>         * targhooks.c (default_builtin_has_mem_ref_p): Default
>         builtin.
>         * tree-ssa-loop-ivopts.c (struct iv): Add field builtin_mem_param.
>         (alloc_iv): Ditto.
>         (remove_unused_ivs): Ditto.
>         (builtin_has_mem_ref_p): New function.
>         (find_interesting_uses_stmt): Special handling of builtins.
>         * gimple-expr.c (is_gimple_address): Add handling of TMR.
>         * gimple-expr.h (is_gimple_addressable): Ditto.
>         * config/i386/i386.c (ix86_builtin_has_mem_ref_p): New target hook.
>         (ix86_atomic_assign_expand_fenv): Ditto.
>         (ix86_expand_special_args_builtin): Special handling of TMR for
>         builtin.
>         * target.def (builtin_has_mem_ref_p): New hook.
>         * doc/tm.texi.in: Ditto.
>         * doc/tm.texi: Generated.
>
> 2013-11-20  Wei Mi  <wmi@google.com>
>
>         * gcc.dg/tree-ssa/ivopt_5.c: New test.
>
> Index: expr.c
> ===================================================================
> --- expr.c      (revision 204792)
> +++ expr.c      (working copy)
> @@ -7467,7 +7467,19 @@ expand_expr_addr_expr_1 (tree exp, rtx t
>           tem = fold_build_pointer_plus (tem, TREE_OPERAND (exp, 1));
>         return expand_expr (tem, target, tmode, modifier);
>        }
> +    case TARGET_MEM_REF:
> +      {
> +       int old_cse_not_expected;
> +       addr_space_t as
> +         = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (exp, 0))));
>
> +       result = addr_for_mem_ref (exp, as, true);
> +       old_cse_not_expected = cse_not_expected;
> +       cse_not_expected = true;
> +       result = memory_address_addr_space (tmode, result, as);
> +       cse_not_expected = old_cse_not_expected;
> +        return result;
> +      }
>      case CONST_DECL:
>        /* Expand the initializer like constants above.  */
>        result = XEXP (expand_expr_constant (DECL_INITIAL (exp),
> @@ -9526,9 +9538,13 @@ expand_expr_real_1 (tree exp, rtx target
>           = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (exp, 0))));
>         enum insn_code icode;
>         unsigned int align;
> +       int old_cse_not_expected;
>
>         op0 = addr_for_mem_ref (exp, as, true);
> +       old_cse_not_expected = cse_not_expected;
> +       cse_not_expected = true;
>         op0 = memory_address_addr_space (mode, op0, as);
> +       cse_not_expected = old_cse_not_expected;
>         temp = gen_rtx_MEM (mode, op0);
>         set_mem_attributes (temp, exp, 0);
>         set_mem_addr_space (temp, as);
> Index: targhooks.c
> ===================================================================
> --- targhooks.c (revision 204792)
> +++ targhooks.c (working copy)
> @@ -566,6 +566,13 @@ default_builtin_reciprocal (unsigned int
>  }
>
>  bool
> +default_builtin_has_mem_ref_p (int built_in_function ATTRIBUTE_UNUSED,
> +                              int i ATTRIBUTE_UNUSED)
> +{
> +  return false;
> +}
> +
> +bool
>  hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false (
>         cumulative_args_t ca ATTRIBUTE_UNUSED,
>         enum machine_mode mode ATTRIBUTE_UNUSED,
> Index: tree-ssa-loop-ivopts.c
> ===================================================================
> --- tree-ssa-loop-ivopts.c      (revision 204792)
> +++ tree-ssa-loop-ivopts.c      (working copy)
> @@ -135,6 +135,8 @@ struct iv
>    tree ssa_name;       /* The ssa name with the value.  */
>    bool biv_p;          /* Is it a biv?  */
>    bool have_use_for;   /* Do we already have a use for it?  */
> +  bool builtin_mem_param; /* Used as param of a builtin, so it could not be
> +                            removed by remove_unused_ivs.  */
>    unsigned use_id;     /* The identifier in the use if it is the case.  */
>  };
>
> @@ -952,6 +954,7 @@ alloc_iv (tree base, tree step)
>    iv->step = step;
>    iv->biv_p = false;
>    iv->have_use_for = false;
> +  iv->builtin_mem_param = false;
>    iv->use_id = 0;
>    iv->ssa_name = NULL_TREE;
>
> @@ -1874,13 +1877,36 @@ find_invariants_stmt (struct ivopts_data
>      }
>  }
>
> +/* Find whether the Ith param of the BUILTIN is a mem
> +   reference. If I is -1, it returns whether the BUILTIN
> +   contains any mem reference type param.  */
> +
> +static bool
> +builtin_has_mem_ref_p (gimple builtin, int i)
> +{
> +  tree fndecl = gimple_call_fndecl (builtin);
> +  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
> +    {
> +      switch (DECL_FUNCTION_CODE (fndecl))
> +       {
> +       case BUILT_IN_PREFETCH:
> +         if (i == -1 || i == 0)
> +            return true;
> +       }
> +    }
> +  else if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
> +    return targetm.builtin_has_mem_ref_p ((int) DECL_FUNCTION_CODE
> (fndecl), i);
> +
> +  return false;
> +}
> +
>  /* Finds interesting uses of induction variables in the statement STMT.  */
>
>  static void
>  find_interesting_uses_stmt (struct ivopts_data *data, gimple stmt)
>  {
>    struct iv *iv;
> -  tree op, *lhs, *rhs;
> +  tree op, *lhs, *rhs, callee;
>    ssa_op_iter iter;
>    use_operand_p use_p;
>    enum tree_code code;
> @@ -1937,6 +1963,74 @@ find_interesting_uses_stmt (struct ivopt
>
>          call (memory).  */
>      }
> +  else if (is_gimple_call (stmt)
> +          && (callee = gimple_call_fndecl (stmt))
> +          && is_builtin_fn (callee)
> +          && builtin_has_mem_ref_p (stmt, -1))
> +    {
> +      size_t i;
> +      for (i = 0; i < gimple_call_num_args (stmt); i++)
> +       {
> +         if (builtin_has_mem_ref_p (stmt, i))
> +           {
> +             gimple def, g;
> +             gimple_seq seq = NULL;
> +             tree type, mem, addr, rhs;
> +             tree *arg = gimple_call_arg_ptr (stmt, i);
> +              location_t loc = gimple_location (stmt);
> +             gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
> +
> +              if (TREE_CODE (*arg) != SSA_NAME)
> +               continue;
> +
> +             def = SSA_NAME_DEF_STMT (*arg);
> +             gcc_assert (gimple_code (def) == GIMPLE_PHI
> +                         || is_gimple_assign (def));
> +             /* Suppose we have the case:
> +                  arg = expr;
> +                  call (arg)
> +                If the expr is not like the form: &MEM(...), change it to:
> +                  arg = expr;
> +                  tmp = &MEM(arg);
> +                  call(tmp);
> +                then try to find interesting uses address in MEM(arg).  */
> +             if (is_gimple_assign (def)
> +                 && (rhs = gimple_assign_rhs1(def))
> +                 && TREE_CODE (rhs) == ADDR_EXPR
> +                 && REFERENCE_CLASS_P (TREE_OPERAND (rhs, 0)))
> +               {
> +                 iv = get_iv (data, *arg);
> +                 if (iv && !iv->builtin_mem_param)
> +                   iv->builtin_mem_param = true;
> +
> +                 find_interesting_uses_address (data, def,
> +                                                &TREE_OPERAND (rhs, 0));
> +               }
> +             else
> +               {
> +                 mem = build2 (MEM_REF, TREE_TYPE (*arg), *arg,
> +                               build_int_cst (TREE_TYPE (*arg), 0));
> +                 type = build_pointer_type (TREE_TYPE (*arg));
> +                 addr = build1 (ADDR_EXPR, type, mem);
> +                 g = gimple_build_assign_with_ops (ADDR_EXPR,
> +                                                   make_ssa_name (type, NULL),
> +                                                   addr, NULL);
> +                 gimple_call_set_arg (stmt, i, gimple_assign_lhs (g));
> +                 update_stmt (stmt);
> +                 gimple_set_location (g, loc);
> +                 gimple_seq_add_stmt_without_update (&seq, g);
> +                 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
> +                 find_interesting_uses_address (data, g,
> &TREE_OPERAND (addr, 0));
> +               }
> +           }
> +         else
> +           {
> +             tree arg = gimple_call_arg (stmt, i);
> +             find_interesting_uses_op (data, arg);
> +           }
> +       }
> +      return;
> +    }
>
>    if (gimple_code (stmt) == GIMPLE_PHI
>        && gimple_bb (stmt) == data->current_loop->header)
> @@ -6507,10 +6601,10 @@ remove_unused_ivs (struct ivopts_data *d
>           && !integer_zerop (info->iv->step)
>           && !info->inv_id
>           && !info->iv->have_use_for
> +         && !info->iv->builtin_mem_param
>           && !info->preserve_biv)
>         {
>           bitmap_set_bit (toremove, SSA_NAME_VERSION (info->iv->ssa_name));
> -
>           tree def = info->iv->ssa_name;
>
>           if (MAY_HAVE_DEBUG_STMTS && SSA_NAME_DEF_STMT (def))
> Index: gimple-expr.c
> ===================================================================
> --- gimple-expr.c       (revision 204792)
> +++ gimple-expr.c       (working copy)
> @@ -633,7 +633,8 @@ is_gimple_address (const_tree t)
>        op = TREE_OPERAND (op, 0);
>      }
>
> -  if (CONSTANT_CLASS_P (op) || TREE_CODE (op) == MEM_REF)
> +  if (CONSTANT_CLASS_P (op) || TREE_CODE (op) == MEM_REF
> +      || TREE_CODE (op) == TARGET_MEM_REF)
>      return true;
>
>    switch (TREE_CODE (op))
> Index: gimple-expr.h
> ===================================================================
> --- gimple-expr.h       (revision 204792)
> +++ gimple-expr.h       (working copy)
> @@ -122,7 +122,8 @@ static inline bool
>  is_gimple_addressable (tree t)
>  {
>    return (is_gimple_id (t) || handled_component_p (t)
> -         || TREE_CODE (t) == MEM_REF);
> +         || TREE_CODE (t) == MEM_REF
> +         || TREE_CODE (t) == TARGET_MEM_REF);
>  }
>
>  /* Return true if T is a valid gimple constant.  */
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 204792)
> +++ config/i386/i386.c  (working copy)
> @@ -29639,6 +29639,50 @@ ix86_init_mmx_sse_builtins (void)
>      }
>  }
>
> +/* Return whether the Ith param of the BUILTIN_FUNCTION
> +   is a memory reference. If I == -1, return whether the
> +   BUILTIN_FUNCTION contains any memory reference param.  */
> +
> +static bool
> +ix86_builtin_has_mem_ref_p (int builtin_function, int i)
> +{
> +  switch ((enum ix86_builtins) builtin_function)
> +    {
> +    /* LOAD.  */
> +    case IX86_BUILTIN_LOADHPS:
> +    case IX86_BUILTIN_LOADLPS:
> +    case IX86_BUILTIN_LOADHPD:
> +    case IX86_BUILTIN_LOADLPD:
> +      if (i == -1 || i == 1)
> +       return true;
> +      break;
> +    case IX86_BUILTIN_LOADUPS:
> +    case IX86_BUILTIN_LOADUPD:
> +    case IX86_BUILTIN_LOADDQU:
> +    case IX86_BUILTIN_LOADUPD256:
> +    case IX86_BUILTIN_LOADUPS256:
> +    case IX86_BUILTIN_LOADDQU256:
> +    case IX86_BUILTIN_LDDQU256:
> +      if (i == -1 || i == 0)
> +       return true;
> +      break;
> +    /* STORE.  */
> +    case IX86_BUILTIN_STOREHPS:
> +    case IX86_BUILTIN_STORELPS:
> +    case IX86_BUILTIN_STOREUPS:
> +    case IX86_BUILTIN_STOREUPD:
> +    case IX86_BUILTIN_STOREDQU:
> +    case IX86_BUILTIN_STOREUPD256:
> +    case IX86_BUILTIN_STOREUPS256:
> +    case IX86_BUILTIN_STOREDQU256:
> +      if (i == -1 || i == 0)
> +       return true;
> +    default:
> +      break;
> +    }
> +  return false;
> +}
> +
>  /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
>     to return a pointer to VERSION_DECL if the outcome of the expression
>     formed by PREDICATE_CHAIN is true.  This function will be called during
> @@ -32525,7 +32569,13 @@ ix86_expand_special_args_builtin (const
>           if (i == memory)
>             {
>               /* This must be the memory operand.  */
> -             op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
> +
> +             /* We expect the builtin could be expanded to rtl with memory
> +                operand and proper addressing mode will be kept as specified
> +                in TARGET_MEM_REF.  */
> +             if (!(TREE_CODE (arg) == ADDR_EXPR
> +                   && TREE_CODE (TREE_OPERAND (arg, 0)) == TARGET_MEM_REF))
> +               op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
>               op = gen_rtx_MEM (mode, op);
>               gcc_assert (GET_MODE (op) == mode
>                           || GET_MODE (op) == VOIDmode);
> @@ -43737,6 +43787,9 @@ ix86_atomic_assign_expand_fenv (tree *ho
>  #undef TARGET_BUILTIN_RECIPROCAL
>  #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
>
> +#undef TARGET_BUILTIN_HAS_MEM_REF_P
> +#define TARGET_BUILTIN_HAS_MEM_REF_P ix86_builtin_has_mem_ref_p
> +
>  #undef TARGET_ASM_FUNCTION_EPILOGUE
>  #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
>
> Index: doc/tm.texi
> ===================================================================
> --- doc/tm.texi (revision 204792)
> +++ doc/tm.texi (working copy)
> @@ -709,6 +709,12 @@ If a target implements string objects th
>  If a target implements string objects then this hook should should
> provide a facility to check the function arguments in @var{args_list}
> against the format specifiers in @var{format_arg} where the type of
> @var{format_arg} is one recognized as a valid string reference type.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_BUILTIN_HAS_MEM_REF_P (int
> @var{builtin_function}, int @var{i})
> +This hook return whether the @var{i}th param of the @var{builtin_function}
> +is a memory reference. If @var{i} is -1, return whether the
> @var{builtin_function}
> +contains any memory reference type param.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} void TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE (void)
>  This target function is similar to the hook @code{TARGET_OPTION_OVERRIDE}
>  but is called when the optimize level is changed via an attribute or
> Index: doc/tm.texi.in
> ===================================================================
> --- doc/tm.texi.in      (revision 204792)
> +++ doc/tm.texi.in      (working copy)
> @@ -697,6 +697,8 @@ should use @code{TARGET_HANDLE_C_OPTION}
>
>  @hook TARGET_CHECK_STRING_OBJECT_FORMAT_ARG
>
> +@hook TARGET_BUILTIN_HAS_MEM_REF_P
> +
>  @hook TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
>
>  @defmac C_COMMON_OVERRIDE_OPTIONS
> Index: target.def
> ===================================================================
> --- target.def  (revision 204792)
> +++ target.def  (working copy)
> @@ -1742,6 +1742,14 @@ HOOK_VECTOR_END (vectorize)
>  #undef HOOK_PREFIX
>  #define HOOK_PREFIX "TARGET_"
>
> +DEFHOOK
> +(builtin_has_mem_ref_p,
> + "This hook return whether the @var{i}th param of the @var{builtin_function}\n\
> +is a memory reference. If @var{i} is -1, return whether the
> @var{builtin_function}\n\
> +contains any memory reference type param.",
> + bool, (int builtin_function, int i),
> + default_builtin_has_mem_ref_p)
> +
>  /* Allow target specific overriding of option settings after options have
>    been changed by an attribute or pragma or when it is reset at the
>    end of the code affected by an attribute or pragma.  */
> Index: testsuite/gcc.dg/tree-ssa/ivopt_5.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/ivopt_5.c (revision 0)
> +++ testsuite/gcc.dg/tree-ssa/ivopt_5.c (revision 0)
> @@ -0,0 +1,21 @@
> +/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
> +/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */
> +
> +/* Make sure only one iv is selected after IVOPT.  */
> +
> +#include <x86intrin.h>
> +extern __m128i arr[], d[];
> +void test (void)
> +{
> +    unsigned int b;
> +    for (b = 0; b < 1000; b += 2) {
> +      __m128i *p = (__m128i *)(&d[b]);
> +      __m128i a = _mm_load_si128(&arr[4*b+3]);
> +      __m128i v = _mm_loadu_si128(p);
> +      v = _mm_xor_si128(v, a);
> +      _mm_storeu_si128(p, v);
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
> +/* { dg-final { cleanup-tree-dump "ivopts" } } */
Wei Mi Nov. 21, 2013, 6:30 p.m. UTC | #4
On Thu, Nov 21, 2013 at 12:19 AM, Zdenek Dvorak
<rakdver@iuuk.mff.cuni.cz> wrote:
> Hi,
>
>> This patch works on the intrinsic calls handling issue in IVOPT mentioned here:
>> http://gcc.gnu.org/ml/gcc-patches/2010-10/msg01295.html
>>
>> In find_interesting_uses_stmt, it changes
>>
>> arg = expr
>> __builtin_xxx (arg)
>>
>> to
>>
>> arg = expr;
>> tmp = addr_expr (mem_ref(arg));
>> __builtin_xxx (tmp, ...)
>
> this looks a bit confusing (and wasteful) to me. It would make more sense to
> just record the argument as USE_ADDRESS and do the rewriting in rewrite_use_address.
>
> Zdenek

My intention was to use find_interesting_uses_address directly. But
you are right, the logic looks better to only do the rewriting in
rewrite_use_address. I will change here.

Thanks,
Wei.
Wei Mi Nov. 21, 2013, 6:41 p.m. UTC | #5
Thanks for the comments.

Regards,
Wei.

On Thu, Nov 21, 2013 at 12:48 AM, Bin.Cheng <amker.cheng@gmail.com> wrote:
> I don't know very much about the problem but willing to study since I
> am looking into IVO recently :)
>
>> --- tree-ssa-loop-ivopts.c      (revision 204792)
>> +++ tree-ssa-loop-ivopts.c      (working copy)
>> @@ -135,6 +135,8 @@ struct iv
>>    tree ssa_name;       /* The ssa name with the value.  */
>>    bool biv_p;          /* Is it a biv?  */
>>    bool have_use_for;   /* Do we already have a use for it?  */
>> +  bool builtin_mem_param; /* Used as param of a builtin, so it could not be
>> +                            removed by remove_unused_ivs.  */
>
> As comment below, address parameter may be not limited to builtin
> function only, how about a variable name more generic?

Ok, I will change it to a different name.

>
>>    unsigned use_id;     /* The identifier in the use if it is the case.  */
>>  };
>>
>> @@ -952,6 +954,7 @@ alloc_iv (tree base, tree step)
>>    iv->step = step;
>>    iv->biv_p = false;
>>    iv->have_use_for = false;
>> +  iv->builtin_mem_param = false;
>>    iv->use_id = 0;
>>    iv->ssa_name = NULL_TREE;
>>
>> @@ -1874,13 +1877,36 @@ find_invariants_stmt (struct ivopts_data
>>      }
>>  }
>>
>> +/* Find whether the Ith param of the BUILTIN is a mem
>> +   reference. If I is -1, it returns whether the BUILTIN
>> +   contains any mem reference type param.  */
>> +
>> +static bool
>> +builtin_has_mem_ref_p (gimple builtin, int i)
>> +{
>> +  tree fndecl = gimple_call_fndecl (builtin);
>> +  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
>> +    {
>> +      switch (DECL_FUNCTION_CODE (fndecl))
>> +       {
>> +       case BUILT_IN_PREFETCH:
>> +         if (i == -1 || i == 0)
>> +            return true;
>> +       }
> This switch looks strange, could be refactored I think.
>

I am not sure if there will be more builtins coming in the func, so I
use switch case here.

>> +    }
>> +  else if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
>> +    return targetm.builtin_has_mem_ref_p ((int) DECL_FUNCTION_CODE
>> (fndecl), i);
>> +
>> +  return false;
>> +}
>> +
>>  /* Finds interesting uses of induction variables in the statement STMT.  */
>>
>>  static void
>>  find_interesting_uses_stmt (struct ivopts_data *data, gimple stmt)
>>  {
>>    struct iv *iv;
>> -  tree op, *lhs, *rhs;
>> +  tree op, *lhs, *rhs, callee;
>>    ssa_op_iter iter;
>>    use_operand_p use_p;
>>    enum tree_code code;
>> @@ -1937,6 +1963,74 @@ find_interesting_uses_stmt (struct ivopt
>>
>>          call (memory).  */
>>      }
>> +  else if (is_gimple_call (stmt)
>> +          && (callee = gimple_call_fndecl (stmt))
>> +          && is_builtin_fn (callee)
>> +          && builtin_has_mem_ref_p (stmt, -1))
>> +    {
>
> I noticed the preceding comments about call(memory), is your change a
> specific case of the mention one?
>

Yes, I will fix it.

>> +      size_t i;
>> +      for (i = 0; i < gimple_call_num_args (stmt); i++)
>> +       {
>> +         if (builtin_has_mem_ref_p (stmt, i))
>> +           {
>> +             gimple def, g;
>> +             gimple_seq seq = NULL;
>> +             tree type, mem, addr, rhs;
>> +             tree *arg = gimple_call_arg_ptr (stmt, i);
>> +              location_t loc = gimple_location (stmt);
>> +             gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
>> +
>> +              if (TREE_CODE (*arg) != SSA_NAME)
>> +               continue;
>> +
>> +             def = SSA_NAME_DEF_STMT (*arg);
>> +             gcc_assert (gimple_code (def) == GIMPLE_PHI
>> +                         || is_gimple_assign (def));
>> +             /* Suppose we have the case:
>> +                  arg = expr;
>> +                  call (arg)
>> +                If the expr is not like the form: &MEM(...), change it to:
>> +                  arg = expr;
>> +                  tmp = &MEM(arg);
>> +                  call(tmp);
>> +                then try to find interesting uses address in MEM(arg).  */
>> +             if (is_gimple_assign (def)
>> +                 && (rhs = gimple_assign_rhs1(def))
>> +                 && TREE_CODE (rhs) == ADDR_EXPR
>> +                 && REFERENCE_CLASS_P (TREE_OPERAND (rhs, 0)))
>> +               {
>> +                 iv = get_iv (data, *arg);
>> +                 if (iv && !iv->builtin_mem_param)
>> +                   iv->builtin_mem_param = true;
>> +
>> +                 find_interesting_uses_address (data, def,
>> +                                                &TREE_OPERAND (rhs, 0));
>> +               }
>> +             else
>> +               {
>> +                 mem = build2 (MEM_REF, TREE_TYPE (*arg), *arg,
>> +                               build_int_cst (TREE_TYPE (*arg), 0));
>> +                 type = build_pointer_type (TREE_TYPE (*arg));
>> +                 addr = build1 (ADDR_EXPR, type, mem);
>> +                 g = gimple_build_assign_with_ops (ADDR_EXPR,
>> +                                                   make_ssa_name (type, NULL),
>> +                                                   addr, NULL);
>> +                 gimple_call_set_arg (stmt, i, gimple_assign_lhs (g));
>> +                 update_stmt (stmt);
>> +                 gimple_set_location (g, loc);
>> +                 gimple_seq_add_stmt_without_update (&seq, g);
>> +                 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
>> +                 find_interesting_uses_address (data, g,
>> &TREE_OPERAND (addr, 0));
>
> This would be the only code changes gimple before iv use rewriting and
> seems not a good practice.

Yes. Zdenek mentioned the same. I will think how to fix it.

>
>> +               }
>> +           }
>> +         else
>> +           {
>> +             tree arg = gimple_call_arg (stmt, i);
>> +             find_interesting_uses_op (data, arg);
>> +           }
>> +       }
>> +      return;
>> +    }
>>
>>    if (gimple_code (stmt) == GIMPLE_PHI
>>        && gimple_bb (stmt) == data->current_loop->header)
>> @@ -6507,10 +6601,10 @@ remove_unused_ivs (struct ivopts_data *d
>>           && !integer_zerop (info->iv->step)
>>           && !info->inv_id
>>           && !info->iv->have_use_for
>> +         && !info->iv->builtin_mem_param
>>           && !info->preserve_biv)
>>         {
>
> Thanks,
> bin
>
> --
> Best Regards.
Wei Mi Nov. 21, 2013, 6:41 p.m. UTC | #6
> So what you are doing is basically not only rewriting memory references
> to possibly use TARGET_MEM_REF but also address uses to use
> &TARGET_MEM_REF.  I think this is a good thing in general
> (given instructions like x86 lea) and I would not bother distinguishing
> the different kind of uses.
>
> Richard.
>

You mean to change normal expr to &TMR(expr) form in order to utilize
x86 lea type instructions as much as possible. It is interesting. I
can experiment that idea later. I am not sure if it could simply work.
My concern is x86 lea still has some limitation (such as three
operands lea will have longer latency and can only be issued to
port1), if we change some expr to &TMR(expr), will it inhibit cse
opportunity if codegen finds out it is not good to use lea?

Thanks,
Wei.
Richard Biener Nov. 21, 2013, 7:36 p.m. UTC | #7
Wei Mi <wmi@google.com> wrote:
>> So what you are doing is basically not only rewriting memory
>references
>> to possibly use TARGET_MEM_REF but also address uses to use
>> &TARGET_MEM_REF.  I think this is a good thing in general
>> (given instructions like x86 lea) and I would not bother
>distinguishing
>> the different kind of uses.
>>
>> Richard.
>>
>
>You mean to change normal expr to &TMR(expr) form in order to utilize
>x86 lea type instructions as much as possible. It is interesting. I
>can experiment that idea later. I am not sure if it could simply work.
>My concern is x86 lea still has some limitation (such as three
>operands lea will have longer latency and can only be issued to
>port1), if we change some expr to &TMR(expr), will it inhitbit cse
>opportunity if codegen find out it is not good to use lea?

That needs to be determined.  Over all it might be because ivopts runs so early.  At rtl level there should not be big differences apart from better initial address computations.

Did I misunderstand what your patch does?

Richard.

>Thanks,
>Wei.
Wei Mi Nov. 21, 2013, 8:08 p.m. UTC | #8
On Thu, Nov 21, 2013 at 11:36 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> Wei Mi <wmi@google.com> wrote:
>>> So what you are doing is basically not only rewriting memory
>>references
>>> to possibly use TARGET_MEM_REF but also address uses to use
>>> &TARGET_MEM_REF.  I think this is a good thing in general
>>> (given instructions like x86 lea) and I would not bother
>>distinguishing
>>> the different kind of uses.
>>>
>>> Richard.
>>>
>>
>>You mean to change normal expr to &TMR(expr) form in order to utilize
>>x86 lea type instructions as much as possible. It is interesting. I
>>can experiment that idea later. I am not sure if it could simply work.
>>My concern is x86 lea still has some limitation (such as three
>>operands lea will have longer latency and can only be issued to
>>port1), if we change some expr to &TMR(expr), will it inhitbit cse
>>opportunity if codegen find out it is not good to use lea?
>
> That needs to be determined.  Over all it might be because ivopts runs so early.  At rtl level there should not be big differences apart from better initial address computations.
>
> Did I misunderstand what your patch does?
>
> Richard.
>

My patch wants to address the issue that iv uses used as memory
reference actuals for load/store/prefetch builtins are treated as
non-linear iv uses instead of address iv uses, and the result of
determine_use_iv_cost is wrong. After we change those uses to address
uses, fewer ivs may be used, TMR will be generated for those iv uses
and efficient addressing modes could be utilized.

Thanks,
Wei.
Richard Biener Nov. 21, 2013, 9:01 p.m. UTC | #9
Wei Mi <wmi@google.com> wrote:
>On Thu, Nov 21, 2013 at 11:36 AM, Richard Biener
><richard.guenther@gmail.com> wrote:
>> Wei Mi <wmi@google.com> wrote:
>>>> So what you are doing is basically not only rewriting memory
>>>references
>>>> to possibly use TARGET_MEM_REF but also address uses to use
>>>> &TARGET_MEM_REF.  I think this is a good thing in general
>>>> (given instructions like x86 lea) and I would not bother
>>>distinguishing
>>>> the different kind of uses.
>>>>
>>>> Richard.
>>>>
>>>
>>>You mean to change normal expr to &TMR(expr) form in order to utilize
>>>x86 lea type instructions as much as possible. It is interesting. I
>>>can experiment that idea later. I am not sure if it could simply
>work.
>>>My concern is x86 lea still has some limitation (such as three
>>>operands lea will have longer latency and can only be issued to
>>>port1), if we change some expr to &TMR(expr), will it inhitbit cse
>>>opportunity if codegen find out it is not good to use lea?
>>
>> That needs to be determined.  Over all it might be because ivopts
>runs so early.  At rtl level there should not be big differences apart
>from better initial address computations.
>>
>> Did I misunderstand what your patch does?
>>
>> Richard.
>>
>
>My patch wants to address the issue that iv uses using as memory
>reference actuals for load/store/prefetch builtins are treated as
>non-linear iv uses instead of address iv uses, and the result of
>determine_use_iv_cost is wrong. After we change those uses to address
>uses, less ivs may be used, TMR will be generated for those iv uses
>and efficent addressing mode could be utilized.

But are not all pointer typed uses address uses?!

Richard.

>Thanks,
>Wei.
Wei Mi Nov. 21, 2013, 9:36 p.m. UTC | #10
On Thu, Nov 21, 2013 at 1:01 PM, Richard Biener
<richard.guenther@gmail.com> wrote:
> Wei Mi <wmi@google.com> wrote:
>>On Thu, Nov 21, 2013 at 11:36 AM, Richard Biener
>><richard.guenther@gmail.com> wrote:
>>> Wei Mi <wmi@google.com> wrote:
>>>>> So what you are doing is basically not only rewriting memory
>>>>references
>>>>> to possibly use TARGET_MEM_REF but also address uses to use
>>>>> &TARGET_MEM_REF.  I think this is a good thing in general
>>>>> (given instructions like x86 lea) and I would not bother
>>>>distinguishing
>>>>> the different kind of uses.
>>>>>
>>>>> Richard.
>>>>>
>>>>
>>>>You mean to change normal expr to &TMR(expr) form in order to utilize
>>>>x86 lea type instructions as much as possible. It is interesting. I
>>>>can experiment that idea later. I am not sure if it could simply
>>work.
>>>>My concern is x86 lea still has some limitation (such as three
>>>>operands lea will have longer latency and can only be issued to
>>>>port1), if we change some expr to &TMR(expr), will it inhitbit cse
>>>>opportunity if codegen find out it is not good to use lea?
>>>
>>> That needs to be determined.  Over all it might be because ivopts
>>runs so early.  At rtl level there should not be big differences apart
>>from better initial address computations.
>>>
>>> Did I misunderstand what your patch does?
>>>
>>> Richard.
>>>
>>
>>My patch wants to address the issue that iv uses using as memory
>>reference actuals for load/store/prefetch builtins are treated as
>>non-linear iv uses instead of address iv uses, and the result of
>>determine_use_iv_cost is wrong. After we change those uses to address
>>uses, less ivs may be used, TMR will be generated for those iv uses
>>and efficent addressing mode could be utilized.
>
> But are not all pointer typed uses address uses?!
>
> Richard.
>

If a pointer typed use is plainly passed by value to a func call, it is
not an address use, right? But as you said, x86 lea may help here.

Thanks,
Wei.
Richard Biener Nov. 22, 2013, 11:12 a.m. UTC | #11
On Thu, Nov 21, 2013 at 10:36 PM, Wei Mi <wmi@google.com> wrote:
> On Thu, Nov 21, 2013 at 1:01 PM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> Wei Mi <wmi@google.com> wrote:
>>>On Thu, Nov 21, 2013 at 11:36 AM, Richard Biener
>>><richard.guenther@gmail.com> wrote:
>>>> Wei Mi <wmi@google.com> wrote:
>>>>>> So what you are doing is basically not only rewriting memory
>>>>>references
>>>>>> to possibly use TARGET_MEM_REF but also address uses to use
>>>>>> &TARGET_MEM_REF.  I think this is a good thing in general
>>>>>> (given instructions like x86 lea) and I would not bother
>>>>>distinguishing
>>>>>> the different kind of uses.
>>>>>>
>>>>>> Richard.
>>>>>>
>>>>>
>>>>>You mean to change normal expr to &TMR(expr) form in order to utilize
>>>>>x86 lea type instructions as much as possible. It is interesting. I
>>>>>can experiment that idea later. I am not sure if it could simply
>>>work.
>>>>>My concern is x86 lea still has some limitation (such as three
>>>>>operands lea will have longer latency and can only be issued to
>>>>>port1), if we change some expr to &TMR(expr), will it inhitbit cse
>>>>>opportunity if codegen find out it is not good to use lea?
>>>>
>>>> That needs to be determined.  Over all it might be because ivopts
>>>runs so early.  At rtl level there should not be big differences apart
>>>from better initial address computations.
>>>>
>>>> Did I misunderstand what your patch does?
>>>>
>>>> Richard.
>>>>
>>>
>>>My patch wants to address the issue that iv uses using as memory
>>>reference actuals for load/store/prefetch builtins are treated as
>>>non-linear iv uses instead of address iv uses, and the result of
>>>determine_use_iv_cost is wrong. After we change those uses to address
>>>uses, less ivs may be used, TMR will be generated for those iv uses
>>>and efficent addressing mode could be utilized.
>>
>> But are not all pointer typed uses address uses?!
>>
>> Richard.
>>
>
> If a pointer typed use is plainly value passed to a func call, it is
> not an address use, right? But as you said, x86 lea may help here.

But that's what you are matching ... (well, for builtins you know
will expand that to a memory reference).

What I dislike in the patch is the special-casing of some builtins
via a target hook.  I'd rather say treat all internal functions and
all target builtins that way.  Or simply all addresses.

Thanks,
Richard.

> Thanks,
> Wei.
Zdenek Dvorak Nov. 22, 2013, 11:19 a.m. UTC | #12
Hi,

> > If a pointer typed use is plainly value passed to a func call, it is
> > not an address use, right? But as you said, x86 lea may help here.
> 
> But that's what you are matching ... (well, for builtins you know
> will expand that to a memory reference).
> 
> What I dislike in the patch is the special-casing of some builtins
> via a target hook.  I'd rather say treat all internal functions and
> all target builtins that way.  Or simply all addresses.

unless the architecture has lea-type instruction to compute the address,
computing say b+4*i incurs some cost, while if mem[b+4*i] is accessed, the
computation is for free.  Thus, it does not make sense to treat the address
computations the same way as memory references (or to treat all functions
the same way as builtins which translate to memory references),

Zdenek
Richard Biener Nov. 22, 2013, 12:46 p.m. UTC | #13
On Fri, Nov 22, 2013 at 12:19 PM, Zdenek Dvorak
<rakdver@iuuk.mff.cuni.cz> wrote:
> Hi,
>
>> > If a pointer typed use is plainly value passed to a func call, it is
>> > not an address use, right? But as you said, x86 lea may help here.
>>
>> But that's what you are matching ... (well, for builtins you know
>> will expand that to a memory reference).
>>
>> What I dislike in the patch is the special-casing of some builtins
>> via a target hook.  I'd rather say treat all internal functions and
>> all target builtins that way.  Or simply all addresses.
>
> unless the architecture has lea-type instruction to compute the address,
> computing say b+4*i incurs some cost, while if mem[b+4*i] is accessed, the
> computation is for free.  Thus, it does not make sense to treat the address
> computations the same way as memory references (or to treat all functions
> the same way as builtins which translate to memory references),

I understand that, but I think the patch tries to improve code generation
not by changing the set of IVs used (thus adjust cost considerations)
but by changing the way it rewrites certain address uses.  And we already
do compute the correct costs for address uses (we just don't perform
exactly the correct verifications for whether a &TMR is supported by
the target?).

Richard.

> Zdenek
Zdenek Dvorak Nov. 22, 2013, 2:11 p.m. UTC | #14
Hi,

> >> > If a pointer typed use is plainly value passed to a func call, it is
> >> > not an address use, right? But as you said, x86 lea may help here.
> >>
> >> But that's what you are matching ... (well, for builtins you know
> >> will expand that to a memory reference).
> >>
> >> What I dislike in the patch is the special-casing of some builtins
> >> via a target hook.  I'd rather say treat all internal functions and
> >> all target builtins that way.  Or simply all addresses.
> >
> > unless the architecture has lea-type instruction to compute the address,
> > computing say b+4*i incurs some cost, while if mem[b+4*i] is accessed, the
> > computation is for free.  Thus, it does not make sense to treat the address
> > computations the same way as memory references (or to treat all functions
> > the same way as builtins which translate to memory references),
> 
> I understand that, but I think the patch tries to improve code generation
> not by changing the set of IVs used (thus adjust cost considerations)
> but by changing the way it rewrites certain address uses.

actually, the costs are adjusted in the patch -- the address calculations
for the handled builtins are recorded as USE_ADDRESS (not as USE_NONLINEAR_EXPR),
so that their costs are calculated in the same way as for memory references,

Zdenek
Bin.Cheng Nov. 22, 2013, 4:14 p.m. UTC | #15
On Fri, Nov 22, 2013 at 10:11 PM, Zdenek Dvorak
<rakdver@iuuk.mff.cuni.cz> wrote:
> Hi,
>
>> >> > If a pointer typed use is plainly value passed to a func call, it is
>> >> > not an address use, right? But as you said, x86 lea may help here.
>> >>
>> >> But that's what you are matching ... (well, for builtins you know
>> >> will expand that to a memory reference).
>> >>
>> >> What I dislike in the patch is the special-casing of some builtins
>> >> via a target hook.  I'd rather say treat all internal functions and
>> >> all target builtins that way.  Or simply all addresses.
>> >
>> > unless the architecture has lea-type instruction to compute the address,
>> > computing say b+4*i incurs some cost, while if mem[b+4*i] is accessed, the
>> > computation is for free.  Thus, it does not make sense to treat the address
>> > computations the same way as memory references (or to treat all functions
>> > the same way as builtins which translate to memory references),
>>
>> I understand that, but I think the patch tries to improve code generation
>> not by changing the set of IVs used (thus adjust cost considerations)
>> but by changing the way it rewrites certain address uses.
>
> actually, the costs are adjusted in the patch -- the address calculations
> for the handled builtins are recorded as USE_ADDRESS (not as USE_NONLINEAR_EXPR),
> so that their costs are calculated in the same way as for memory references,
>
> Zdenek
I think the problem can be showed by below example:
struct tag
{
  int a[10];
  int b;
};
struct tag s;
int foo(int len)
{
  int i = 0;
  int sum = 0;
  for (i = 0; i < len; i++)
    sum += barr (&s.a[i]);

  return sum;
}
The dump before IVOPT is like:

  <bb 4>:
  # i_16 = PHI <i_10(6), 0(3)>
  # sum_17 = PHI <sum_9(6), 0(3)>
  _6 = &s.a[i_16];
  _8 = barr (_6);
  sum_9 = _8 + sum_17;
  i_10 = i_16 + 1;
  if (len_5(D) > i_10)
    goto <bb 6>;
  else
    goto <bb 5>;

  <bb 5>:
  # sum_11 = PHI <sum_9(4)>
  goto <bb 7>;

  <bb 6>:
  goto <bb 4>;

> The iv use of _6 in bar(_6) is actually a memory address and it can
> be computed efficiently for some targets.  For now IVOPT only honors
address type iv uses appeared in memory access, so this patch is
trying to handle this kind of address iv which is outside of memory
access just the same.  Please correct me if I am wrong.

After thought twice, I have two concerns about this:
> 1) Why can't we just enhance the nonlinear use logic to handle this
> kind of iv use?  It's more complicated to handle it as a normal
address type iv, consider that IVOPT adds auto-increment candidates
according to them, you have to deal with that in this way
(auto-increment addressing mode can't apply to this kind of address iv
use).
2) If I understand it right, this is an issue not only limited to
builtin functions, it stands for normal function calls too, right?

Thanks,
bin
Wei Mi Nov. 22, 2013, 5:02 p.m. UTC | #16
On Fri, Nov 22, 2013 at 6:11 AM, Zdenek Dvorak <rakdver@iuuk.mff.cuni.cz> wrote:
> Hi,
>
>> >> > If a pointer typed use is plainly value passed to a func call, it is
>> >> > not an address use, right? But as you said, x86 lea may help here.
>> >>
>> >> But that's what you are matching ... (well, for builtins you know
>> >> will expand that to a memory reference).
>> >>
>> >> What I dislike in the patch is the special-casing of some builtins
>> >> via a target hook.  I'd rather say treat all internal functions and
>> >> all target builtins that way.  Or simply all addresses.
>> >
>> > unless the architecture has lea-type instruction to compute the address,
>> > computing say b+4*i incurs some cost, while if mem[b+4*i] is accessed, the
>> > computation is for free.  Thus, it does not make sense to treat the address
>> > computations the same way as memory references (or to treat all functions
>> > the same way as builtins which translate to memory references),
>>
>> I understand that, but I think the patch tries to improve code generation
>> not by changing the set of IVs used (thus adjust cost considerations)
>> but by changing the way it rewrites certain address uses.
>
> actually, the costs are adjusted in the patch -- the address calculations
> for the handled builtins are recorded as USE_ADDRESS (not as USE_NONLINEAR_EXPR),
> so that their costs are calculated in the same way as for memory references,
>
> Zdenek

Sorry for not making it clear. Yes, the costs are adjusted in the
patch. Although I changed the expr to addr_expr(mem(expr)) pattern,
the intention was only to avoid changing find_interesting_uses_address
and rewrite_use_address, which Zdenek and Bin.Cheng thought was not
good. I am changing this part.

Thanks,
Wei.
Wei Mi Nov. 22, 2013, 5:21 p.m. UTC | #17
> I think the problem can be showed by below example:
> struct tag
> {
>   int a[10];
>   int b;
> };
> struct tag s;
> int foo(int len)
> {
>   int i = 0;
>   int sum = 0;
>   for (i = 0; i < len; i++)
>     sum += barr (&s.a[i]);
>
>   return sum;
> }
> The dump before IVOPT is like:
>
>   <bb 4>:
>   # i_16 = PHI <i_10(6), 0(3)>
>   # sum_17 = PHI <sum_9(6), 0(3)>
>   _6 = &s.a[i_16];
>   _8 = barr (_6);
>   sum_9 = _8 + sum_17;
>   i_10 = i_16 + 1;
>   if (len_5(D) > i_10)
>     goto <bb 6>;
>   else
>     goto <bb 5>;
>
>   <bb 5>:
>   # sum_11 = PHI <sum_9(4)>
>   goto <bb 7>;
>
>   <bb 6>:
>   goto <bb 4>;
>
> The iv use of _6 in bar(_6) is actually an memory address and it can
> be computed efficiently for some targets.  For now IVOPT only honors
> address type iv uses appeared in memory access, so this patch is
> trying to handle this kind of address iv which is outside of memory
> access just the same.  Please correct me if I am wrong.

Yes, that is correct.

>
> After thought twice, I have two concerns about this:
> 1) Why can't we just enhance the nolinear use logic to handle this
> kind of iv use?  It's more complicated to handle it as a normal
> address type iv, consider that IVOPT adds auto-increment candidates
> according to them, you have to deal with that in this way
> (auto-increment addressing mode can't apply to this kind of address iv
> use).

I think existing address iv use logic is enough to handle it. I am
changing it and moving the gimple change from
find_interesting_uses_stmt to rewrite_use_address in original patch.

> 2) If I understand it right, this is an issue not only limited to
> builtin functions, it stands for normal function calls too, right?
>

For builtin function, such as _mm_loadu_si128(b+4*i), it will be
expanded to an insn: MOVDQU mem[b+4*i], $xmmreg, and the computation
of b+4*i is for free. But for function call, the b+4*i will only be
used as the value of an actual, and its computation cost cannot be
avoided.

Thanks,
Wei.
Wei Mi Nov. 22, 2013, 5:57 p.m. UTC | #18
On Fri, Nov 22, 2013 at 9:21 AM, Wei Mi <wmi@google.com> wrote:
>> I think the problem can be showed by below example:
>> struct tag
>> {
>>   int a[10];
>>   int b;
>> };
>> struct tag s;
>> int foo(int len)
>> {
>>   int i = 0;
>>   int sum = 0;
>>   for (i = 0; i < len; i++)
>>     sum += barr (&s.a[i]);
>>
>>   return sum;
>> }
>> The dump before IVOPT is like:
>>
>>   <bb 4>:
>>   # i_16 = PHI <i_10(6), 0(3)>
>>   # sum_17 = PHI <sum_9(6), 0(3)>
>>   _6 = &s.a[i_16];
>>   _8 = barr (_6);
>>   sum_9 = _8 + sum_17;
>>   i_10 = i_16 + 1;
>>   if (len_5(D) > i_10)
>>     goto <bb 6>;
>>   else
>>     goto <bb 5>;
>>
>>   <bb 5>:
>>   # sum_11 = PHI <sum_9(4)>
>>   goto <bb 7>;
>>
>>   <bb 6>:
>>   goto <bb 4>;
>>
>> The iv use of _6 in bar(_6) is actually an memory address and it can
>> be computed efficiently for some targets.  For now IVOPT only honors
>> address type iv uses appeared in memory access, so this patch is
>> trying to handle this kind of address iv which is outside of memory
>> access just the same.  Please correct me if I am wrong.
>
> Yes, that is correct.
>

Sorry, to make a correction here. That is not what my patch is doing.
The patch is not handling normal address exprs, but those exprs that
could be expressed as mem accesses after builtins expand.

>>
>> After thought twice, I have two concerns about this:
>> 1) Why can't we just enhance the nolinear use logic to handle this
>> kind of iv use?  It's more complicated to handle it as a normal
>> address type iv, consider that IVOPT adds auto-increment candidates
>> according to them, you have to deal with that in this way
>> (auto-increment addressing mode can't apply to this kind of address iv
>> use).
>
> I think existing address iv use logic is enough to handle it. I am
> changing it and moving the gimple change from
> find_interesting_uses_stmt to rewrite_use_address in original patch.
>
>> 2) If I understand it right, this is an issue not only limited to
>> builtin functions, it stands for normal function calls too, right?
>>
>
> For builtin function, such as _mm_loadu_si128(b+4*i), it will be
> expanded to an insn: MOVDQU mem[b+4*i], $xmmreg, and the computation
> of b+4*i is for free. But for function call, the b+4*i will only be
> used as the value of an actual, and its computation cost cannot be
> avoided.
>
> Thanks,
> Wei.
Bin.Cheng Nov. 23, 2013, 5:10 a.m. UTC | #19
On Sat, Nov 23, 2013 at 1:57 AM, Wei Mi <wmi@google.com> wrote:
> On Fri, Nov 22, 2013 at 9:21 AM, Wei Mi <wmi@google.com> wrote:
>>> I think the problem can be showed by below example:
>>> struct tag
>>> {
>>>   int a[10];
>>>   int b;
>>> };
>>> struct tag s;
>>> int foo(int len)
>>> {
>>>   int i = 0;
>>>   int sum = 0;
>>>   for (i = 0; i < len; i++)
>>>     sum += barr (&s.a[i]);
>>>
>>>   return sum;
>>> }
>>> The dump before IVOPT is like:
>>>
>>>   <bb 4>:
>>>   # i_16 = PHI <i_10(6), 0(3)>
>>>   # sum_17 = PHI <sum_9(6), 0(3)>
>>>   _6 = &s.a[i_16];
>>>   _8 = barr (_6);
>>>   sum_9 = _8 + sum_17;
>>>   i_10 = i_16 + 1;
>>>   if (len_5(D) > i_10)
>>>     goto <bb 6>;
>>>   else
>>>     goto <bb 5>;
>>>
>>>   <bb 5>:
>>>   # sum_11 = PHI <sum_9(4)>
>>>   goto <bb 7>;
>>>
>>>   <bb 6>:
>>>   goto <bb 4>;
>>>
>>> The iv use of _6 in bar(_6) is actually an memory address and it can
>>> be computed efficiently for some targets.  For now IVOPT only honors
>>> address type iv uses appeared in memory access, so this patch is
>>> trying to handle this kind of address iv which is outside of memory
>>> access just the same.  Please correct me if I am wrong.
>>
>> Yes, that is correct.
>>
>
> Sorry, to make a correction here. That is not my patch is doing. The
> patch is not handling normal address exprs, but those exprs could be
> expressed as mem accesses after builtins expand.
>
Thanks for explanation, now I understand why these builtins are special.

Thanks,
bin
diff mbox

Patch

Index: expr.c
===================================================================
--- expr.c      (revision 204792)
+++ expr.c      (working copy)
@@ -7467,7 +7467,19 @@  expand_expr_addr_expr_1 (tree exp, rtx t
          tem = fold_build_pointer_plus (tem, TREE_OPERAND (exp, 1));
        return expand_expr (tem, target, tmode, modifier);
       }
+    case TARGET_MEM_REF:
+      {
+       int old_cse_not_expected;
+       addr_space_t as
+         = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (exp, 0))));

+       result = addr_for_mem_ref (exp, as, true);
+       old_cse_not_expected = cse_not_expected;
+       cse_not_expected = true;
+       result = memory_address_addr_space (tmode, result, as);
+       cse_not_expected = old_cse_not_expected;
+        return result;
+      }
     case CONST_DECL:
       /* Expand the initializer like constants above.  */
       result = XEXP (expand_expr_constant (DECL_INITIAL (exp),
@@ -9526,9 +9538,13 @@  expand_expr_real_1 (tree exp, rtx target
          = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (exp, 0))));
        enum insn_code icode;
        unsigned int align;
+       int old_cse_not_expected;

        op0 = addr_for_mem_ref (exp, as, true);
+       old_cse_not_expected = cse_not_expected;
+       cse_not_expected = true;
        op0 = memory_address_addr_space (mode, op0, as);
+       cse_not_expected = old_cse_not_expected;
        temp = gen_rtx_MEM (mode, op0);
        set_mem_attributes (temp, exp, 0);
        set_mem_addr_space (temp, as);
Index: targhooks.c
===================================================================
--- targhooks.c (revision 204792)
+++ targhooks.c (working copy)
@@ -566,6 +566,13 @@  default_builtin_reciprocal (unsigned int
 }

 bool
+default_builtin_has_mem_ref_p (int built_in_function ATTRIBUTE_UNUSED,
+                              int i ATTRIBUTE_UNUSED)
+{
+  return false;
+}
+
+bool
 hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false (
        cumulative_args_t ca ATTRIBUTE_UNUSED,
        enum machine_mode mode ATTRIBUTE_UNUSED,
Index: tree-ssa-loop-ivopts.c
===================================================================
--- tree-ssa-loop-ivopts.c      (revision 204792)
+++ tree-ssa-loop-ivopts.c      (working copy)
@@ -135,6 +135,8 @@  struct iv
   tree ssa_name;       /* The ssa name with the value.  */
   bool biv_p;          /* Is it a biv?  */
   bool have_use_for;   /* Do we already have a use for it?  */
+  bool builtin_mem_param; /* Used as param of a builtin, so it could not be
+                            removed by remove_unused_ivs.  */
   unsigned use_id;     /* The identifier in the use if it is the case.  */
 };

@@ -952,6 +954,7 @@  alloc_iv (tree base, tree step)
   iv->step = step;
   iv->biv_p = false;
   iv->have_use_for = false;
+  iv->builtin_mem_param = false;
   iv->use_id = 0;
   iv->ssa_name = NULL_TREE;

@@ -1874,13 +1877,36 @@  find_invariants_stmt (struct ivopts_data
     }
 }

+/* Find whether the Ith param of the BUILTIN is a mem
+   reference. If I is -1, it returns whether the BUILTIN
+   contains any mem reference type param.  */
+
+static bool
+builtin_has_mem_ref_p (gimple builtin, int i)
+{
+  tree fndecl = gimple_call_fndecl (builtin);
+  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
+    {
+      switch (DECL_FUNCTION_CODE (fndecl))
+       {
+       case BUILT_IN_PREFETCH:
+         if (i == -1 || i == 0)
+            return true;
+       }
+    }
+  else if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+    return targetm.builtin_has_mem_ref_p ((int) DECL_FUNCTION_CODE
(fndecl), i);
+
+  return false;
+}
+
 /* Finds interesting uses of induction variables in the statement STMT.  */

 static void
 find_interesting_uses_stmt (struct ivopts_data *data, gimple stmt)
 {
   struct iv *iv;
-  tree op, *lhs, *rhs;
+  tree op, *lhs, *rhs, callee;
   ssa_op_iter iter;
   use_operand_p use_p;
   enum tree_code code;
@@ -1937,6 +1963,74 @@  find_interesting_uses_stmt (struct ivopt

         call (memory).  */
     }
+  else if (is_gimple_call (stmt)
+          && (callee = gimple_call_fndecl (stmt))
+          && is_builtin_fn (callee)
+          && builtin_has_mem_ref_p (stmt, -1))
+    {
+      size_t i;
+      for (i = 0; i < gimple_call_num_args (stmt); i++)
+       {
+         if (builtin_has_mem_ref_p (stmt, i))
+           {
+             gimple def, g;
+             gimple_seq seq = NULL;
+             tree type, mem, addr, rhs;
+             tree *arg = gimple_call_arg_ptr (stmt, i);
+              location_t loc = gimple_location (stmt);
+             gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+
+              if (TREE_CODE (*arg) != SSA_NAME)
+               continue;
+
+             def = SSA_NAME_DEF_STMT (*arg);
+             gcc_assert (gimple_code (def) == GIMPLE_PHI
+                         || is_gimple_assign (def));
+             /* Suppose we have the case:
+                  arg = expr;
+                  call (arg)
+                If the expr is not like the form: &MEM(...), change it to:
+                  arg = expr;
+                  tmp = &MEM(arg);
+                  call(tmp);
+                then try to find interesting uses address in MEM(arg).  */
+             if (is_gimple_assign (def)
+                 && (rhs = gimple_assign_rhs1(def))
+                 && TREE_CODE (rhs) == ADDR_EXPR
+                 && REFERENCE_CLASS_P (TREE_OPERAND (rhs, 0)))
+               {
+                 iv = get_iv (data, *arg);
+                 if (iv && !iv->builtin_mem_param)
+                   iv->builtin_mem_param = true;
+
+                 find_interesting_uses_address (data, def,
+                                                &TREE_OPERAND (rhs, 0));
+               }
+             else
+               {
+                 mem = build2 (MEM_REF, TREE_TYPE (*arg), *arg,
+                               build_int_cst (TREE_TYPE (*arg), 0));
+                 type = build_pointer_type (TREE_TYPE (*arg));
+                 addr = build1 (ADDR_EXPR, type, mem);
+                 g = gimple_build_assign_with_ops (ADDR_EXPR,
+                                                   make_ssa_name (type, NULL),
+                                                   addr, NULL);
+                 gimple_call_set_arg (stmt, i, gimple_assign_lhs (g));
+                 update_stmt (stmt);
+                 gimple_set_location (g, loc);
+                 gimple_seq_add_stmt_without_update (&seq, g);
+                 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+                 find_interesting_uses_address (data, g,
&TREE_OPERAND (addr, 0));
+               }
+           }
+         else
+           {
+             tree arg = gimple_call_arg (stmt, i);
+             find_interesting_uses_op (data, arg);
+           }
+       }
+      return;
+    }

   if (gimple_code (stmt) == GIMPLE_PHI
       && gimple_bb (stmt) == data->current_loop->header)
@@ -6507,10 +6601,10 @@  remove_unused_ivs (struct ivopts_data *d
          && !integer_zerop (info->iv->step)
          && !info->inv_id
          && !info->iv->have_use_for
+         && !info->iv->builtin_mem_param
          && !info->preserve_biv)
        {
          bitmap_set_bit (toremove, SSA_NAME_VERSION (info->iv->ssa_name));
-
          tree def = info->iv->ssa_name;

          if (MAY_HAVE_DEBUG_STMTS && SSA_NAME_DEF_STMT (def))
Index: gimple-expr.c
===================================================================
--- gimple-expr.c       (revision 204792)
+++ gimple-expr.c       (working copy)
@@ -633,7 +633,8 @@  is_gimple_address (const_tree t)
       op = TREE_OPERAND (op, 0);
     }

-  if (CONSTANT_CLASS_P (op) || TREE_CODE (op) == MEM_REF)
+  if (CONSTANT_CLASS_P (op) || TREE_CODE (op) == MEM_REF
+      || TREE_CODE (op) == TARGET_MEM_REF)
     return true;

   switch (TREE_CODE (op))
Index: gimple-expr.h
===================================================================
--- gimple-expr.h       (revision 204792)
+++ gimple-expr.h       (working copy)
@@ -122,7 +122,8 @@  static inline bool
 is_gimple_addressable (tree t)
 {
   return (is_gimple_id (t) || handled_component_p (t)
-         || TREE_CODE (t) == MEM_REF);
+         || TREE_CODE (t) == MEM_REF
+         || TREE_CODE (t) == TARGET_MEM_REF);
 }

 /* Return true if T is a valid gimple constant.  */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 204792)
+++ config/i386/i386.c  (working copy)
@@ -29639,6 +29639,50 @@  ix86_init_mmx_sse_builtins (void)
     }
 }

+/* Return whether the Ith parameter of BUILTIN_FUNCTION is a memory
+   reference.  If I == -1, return whether BUILTIN_FUNCTION has any
+   memory reference parameter at all.  */
+
+static bool
+ix86_builtin_has_mem_ref_p (int builtin_function, int i)
+{
+  switch ((enum ix86_builtins) builtin_function)
+    {
+    /* LOAD.  */
+    case IX86_BUILTIN_LOADHPS:
+    case IX86_BUILTIN_LOADLPS:
+    case IX86_BUILTIN_LOADHPD:
+    case IX86_BUILTIN_LOADLPD:
+      /* These merge a loaded value into an existing vector, so the
+        address is the second argument.  */
+      if (i == -1 || i == 1)
+       return true;
+      break;
+    case IX86_BUILTIN_LOADUPS:
+    case IX86_BUILTIN_LOADUPD:
+    case IX86_BUILTIN_LOADDQU:
+    case IX86_BUILTIN_LOADUPD256:
+    case IX86_BUILTIN_LOADUPS256:
+    case IX86_BUILTIN_LOADDQU256:
+    case IX86_BUILTIN_LDDQU256:
+      /* Plain unaligned loads take the address as the first argument.  */
+      if (i == -1 || i == 0)
+       return true;
+      break;
+    /* STORE.  */
+    case IX86_BUILTIN_STOREHPS:
+    case IX86_BUILTIN_STORELPS:
+    case IX86_BUILTIN_STOREUPS:
+    case IX86_BUILTIN_STOREUPD:
+    case IX86_BUILTIN_STOREDQU:
+    case IX86_BUILTIN_STOREUPD256:
+    case IX86_BUILTIN_STOREUPS256:
+    case IX86_BUILTIN_STOREDQU256:
+      /* Stores take the destination address as the first argument.  */
+      if (i == -1 || i == 0)
+       return true;
+      break;
+    default:
+      break;
+    }
+  return false;
+}
+
 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
    to return a pointer to VERSION_DECL if the outcome of the expression
    formed by PREDICATE_CHAIN is true.  This function will be called during
@@ -32525,7 +32569,13 @@  ix86_expand_special_args_builtin (const
          if (i == memory)
            {
              /* This must be the memory operand.  */
-             op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
+
+             /* We expect the builtin could be expanded to rtl with memory
+                operand and proper addressing mode will be kept as specified
+                in TARGET_MEM_REF.  */
+             if (!(TREE_CODE (arg) == ADDR_EXPR
+                   && TREE_CODE (TREE_OPERAND (arg, 0)) == TARGET_MEM_REF))
+               op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
              op = gen_rtx_MEM (mode, op);
              gcc_assert (GET_MODE (op) == mode
                          || GET_MODE (op) == VOIDmode);
@@ -43737,6 +43787,9 @@  ix86_atomic_assign_expand_fenv (tree *ho
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal

+#undef TARGET_BUILTIN_HAS_MEM_REF_P
+#define TARGET_BUILTIN_HAS_MEM_REF_P ix86_builtin_has_mem_ref_p
+
 #undef TARGET_ASM_FUNCTION_EPILOGUE
 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue

Index: doc/tm.texi
===================================================================
--- doc/tm.texi (revision 204792)
+++ doc/tm.texi (working copy)
@@ -709,6 +709,12 @@  If a target implements string objects th
 If a target implements string objects then this hook should should
provide a facility to check the function arguments in @var{args_list}
against the format specifiers in @var{format_arg} where the type of
@var{format_arg} is one recognized as a valid string reference type.
 @end deftypefn

+@deftypefn {Target Hook} bool TARGET_BUILTIN_HAS_MEM_REF_P (int @var{builtin_function}, int @var{i})
+This hook returns whether the @var{i}th parameter of @var{builtin_function}
+is a memory reference.  If @var{i} is -1, it returns whether
+@var{builtin_function} has any memory reference parameter at all.
+@end deftypefn
+
+
 @deftypefn {Target Hook} void TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE (void)
 This target function is similar to the hook @code{TARGET_OPTION_OVERRIDE}
 but is called when the optimize level is changed via an attribute or
Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in      (revision 204792)
+++ doc/tm.texi.in      (working copy)
@@ -697,6 +697,8 @@  should use @code{TARGET_HANDLE_C_OPTION}

 @hook TARGET_CHECK_STRING_OBJECT_FORMAT_ARG

+@hook TARGET_BUILTIN_HAS_MEM_REF_P
+
 @hook TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE

 @defmac C_COMMON_OVERRIDE_OPTIONS
Index: target.def
===================================================================
--- target.def  (revision 204792)
+++ target.def  (working copy)
@@ -1742,6 +1742,14 @@  HOOK_VECTOR_END (vectorize)
 #undef HOOK_PREFIX
 #define HOOK_PREFIX "TARGET_"

+DEFHOOK
+(builtin_has_mem_ref_p,
+ "This hook returns whether the @var{i}th parameter of\n\
+@var{builtin_function} is a memory reference.  If @var{i} is -1, it\n\
+returns whether @var{builtin_function} has any memory reference parameter.",
+ bool, (int builtin_function, int i),
+ default_builtin_has_mem_ref_p)
+
+
 /* Allow target specific overriding of option settings after options have
   been changed by an attribute or pragma or when it is reset at the
   end of the code affected by an attribute or pragma.  */
Index: testsuite/gcc.dg/tree-ssa/ivopt_5.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/ivopt_5.c (revision 0)
+++ testsuite/gcc.dg/tree-ssa/ivopt_5.c (revision 0)
@@ -0,0 +1,21 @@ 
+/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */
+
+/* Make sure only one iv is selected after IVOPT.  */
+
+#include <x86intrin.h>
+extern __m128i arr[], d[];
+void test (void)
+{
+    unsigned int b;
+    for (b = 0; b < 1000; b += 2) {
+      __m128i *p = (__m128i *)(&d[b]);
+      __m128i a = _mm_load_si128(&arr[4*b+3]);
+      __m128i v = _mm_loadu_si128(p);
+      v = _mm_xor_si128(v, a);
+      _mm_storeu_si128(p, v);
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */