[rs6000] Folding of vector loads in GIMPLE

Message ID 1505227262.14827.155.camel@brimstone.rchland.ibm.com
State New
Series [rs6000] Folding of vector loads in GIMPLE

Commit Message

will schmidt Sept. 12, 2017, 2:41 p.m. UTC
Hi

[PATCH, rs6000] Folding of vector loads in GIMPLE

Folding of vector loads in GIMPLE.
    
- Add code to handle gimple folding for the vec_ld builtins.
- Remove the now-obsolete folding code for vec_ld from rs6000-c.c.  The
surrounding comments have been adjusted slightly so they still read correctly
for the vec_st code that remains.

The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
tests, which have been posted separately (a few minutes ago).

Regtest completed successfully on power6 and newer (p6, p7, p8le, p8be, p9).

OK for trunk?

Thanks,
-Will
    
[gcc]
    
        2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
    
	* config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
	  for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
	* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
	  Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
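
For illustration only (not part of the patch), the effect of the fold on a
vec_ld call corresponds roughly to the following C-level sketch; the function
and variable names here are hypothetical:

  #include <altivec.h>
  #include <stdint.h>

  /* Illustrative only: the GIMPLE fold turns
       vector signed int v = vec_ld (offset, ptr);
     into the equivalent of adding the offset, masking the address down to
     16-byte alignment, and performing an ordinary vector load.  */
  vector signed int
  load_example (long offset, const signed int *ptr)
  {
    uintptr_t addr = (uintptr_t) ptr + offset;   /* POINTER_PLUS_EXPR */
    addr &= ~(uintptr_t) 15;                     /* BIT_AND_EXPR with -16 */
    return *(const vector signed int *) addr;    /* MEM_REF */
  }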

Comments

Bill Schmidt Sept. 12, 2017, 3:22 p.m. UTC | #1
> On Sep 12, 2017, at 9:41 AM, Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
> 
> Hi
> 
> [PATCH, rs6000] Folding of vector loads in GIMPLE
> 
> Folding of vector loads in GIMPLE.
> 
> - Add code to handle gimple folding for the vec_ld builtins.
> - Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
> comments have been adjusted slightly so they continue to read OK for the
> vec_st code that remains.
> 
> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
> tests which have been posted separately. (a few minutes ago).
> 
> Regtest successfully completed on power6 and newer. (p6,p7,p8le,p8be,p9).
> 
> OK for trunk?
> 
> Thanks,
> -Will
> 
> [gcc]
> 
>        2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
> 
> 	* config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
> 	  for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
> 	* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
> 	  Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
> 
> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
> index 897306c..73e14d9 100644
> --- a/gcc/config/rs6000/rs6000-c.c
> +++ b/gcc/config/rs6000/rs6000-c.c
> @@ -6459,92 +6459,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
> 		     convert (TREE_TYPE (stmt), arg0));
>       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>       return stmt;
>     }
> 
> -  /* Expand vec_ld into an expression that masks the address and
> -     performs the load.  We need to expand this early to allow
> +  /* Expand vec_st into an expression that masks the address and
> +     performs the store.  We need to expand this early to allow
>      the best aliasing, as by the time we get into RTL we no longer
>      are able to honor __restrict__, for example.  We may want to
>      consider this for all memory access built-ins.
> 
>      When -maltivec=be is specified, or the wrong number of arguments
>      is provided, simply punt to existing built-in processing.  */
> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> -      && nargs == 2)
> -    {
> -      tree arg0 = (*arglist)[0];
> -      tree arg1 = (*arglist)[1];
> -
> -      /* Strip qualifiers like "const" from the pointer arg.  */
> -      tree arg1_type = TREE_TYPE (arg1);
> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
> -	goto bad;
> -
> -      tree inner_type = TREE_TYPE (arg1_type);
> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
> -	{
> -	  arg1_type = build_pointer_type (build_qualified_type (inner_type,
> -								0));
> -	  arg1 = fold_convert (arg1_type, arg1);
> -	}
> -
> -      /* Construct the masked address.  Let existing error handling take
> -	 over if we don't have a constant offset.  */
> -      arg0 = fold (arg0);
> -
> -      if (TREE_CODE (arg0) == INTEGER_CST)
> -	{
> -	  if (!ptrofftype_p (TREE_TYPE (arg0)))
> -	    arg0 = build1 (NOP_EXPR, sizetype, arg0);
> -
> -	  tree arg1_type = TREE_TYPE (arg1);
> -	  if (TREE_CODE (arg1_type) == ARRAY_TYPE)
> -	    {
> -	      arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
> -	      tree const0 = build_int_cstu (sizetype, 0);
> -	      tree arg1_elt0 = build_array_ref (loc, arg1, const0);
> -	      arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
> -	    }
> -
> -	  tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
> -				       arg1, arg0);
> -	  tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
> -					  build_int_cst (arg1_type, -16));
> -
> -	  /* Find the built-in to get the return type so we can convert
> -	     the result properly (or fall back to default handling if the
> -	     arguments aren't compatible).  */
> -	  for (desc = altivec_overloaded_builtins;
> -	       desc->code && desc->code != fcode; desc++)
> -	    continue;
> -
> -	  for (; desc->code == fcode; desc++)
> -	    if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
> -		&& (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
> -						    desc->op2)))
> -	      {
> -		tree ret_type = rs6000_builtin_type (desc->ret_type);
> -		if (TYPE_MODE (ret_type) == V2DImode)
> -		  /* Type-based aliasing analysis thinks vector long
> -		     and vector long long are different and will put them
> -		     in distinct alias classes.  Force our return type
> -		     to be a may-alias type to avoid this.  */
> -		  ret_type
> -		    = build_pointer_type_for_mode (ret_type, Pmode,
> -						   true/*can_alias_all*/);
> -		else
> -		  ret_type = build_pointer_type (ret_type);
> -		aligned = build1 (NOP_EXPR, ret_type, aligned);
> -		tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
> -		return ret_val;
> -	      }
> -	}
> -    }
> 
> -  /* Similarly for stvx.  */
>   if (fcode == ALTIVEC_BUILTIN_VEC_ST
>       && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>       && nargs == 3)
>     {
>       tree arg0 = (*arglist)[0];
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index cf744d8..5b14789 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -16473,10 +16473,65 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> 	res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
> 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> 	update_call_from_tree (gsi, res);
> 	return true;
>       }
> +    /* Vector loads.  */
> +    case ALTIVEC_BUILTIN_LVX_V16QI:
> +    case ALTIVEC_BUILTIN_LVX_V8HI:
> +    case ALTIVEC_BUILTIN_LVX_V4SI:
> +    case ALTIVEC_BUILTIN_LVX_V4SF:
> +    case ALTIVEC_BUILTIN_LVX_V2DI:
> +    case ALTIVEC_BUILTIN_LVX_V2DF:
> +      {
> +	 gimple *g;
> +	 arg0 = gimple_call_arg (stmt, 0);  // offset
> +	 arg1 = gimple_call_arg (stmt, 1);  // address
> +
> +	 /* Limit folding of loads to LE targets.  */
> +	 if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
> +	   return false;

Why?  This transformation shouldn't be endian-dependent.

Thanks,
Bill

> +
> +	 lhs = gimple_call_lhs (stmt);
> +	 location_t loc = gimple_location (stmt);
> +
> +	 tree arg1_type = TREE_TYPE (arg1);
> +	 tree lhs_type = TREE_TYPE (lhs);
> +
> +	 /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
> +	    the tree using the value from arg0.  The resulting type will match
> +	    the type of arg1.  */
> +	 tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
> +	 g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
> +	 gimple_set_location (g, loc);
> +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +	 tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
> +	 g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
> +				  temp_offset);
> +	 gimple_set_location (g, loc);
> +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +
> +	 /* Mask off any lower bits from the address.  */
> +	 tree alignment_mask = build_int_cst (arg1_type, -16);
> +	 tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
> +	 g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
> +				 temp_addr, alignment_mask);
> +	 gimple_set_location (g, loc);
> +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +
> +	 /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
> +	    take an offset, but since we've already incorporated the offset
> +	    above, here we just pass in a zero.  */
> +	 g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
> +						build_int_cst (arg1_type, 0)));
> +	 gimple_set_location (g, loc);
> +	 gsi_replace (gsi, g, true);
> +
> +	 return true;
> +
> +      }
> +
>     default:
> 	if (TARGET_DEBUG_BUILTIN)
> 	   fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
> 		    fn_code, fn_name1, fn_name2);
>       break;
> 
>
will schmidt Sept. 12, 2017, 5:45 p.m. UTC | #2
On Tue, 2017-09-12 at 10:22 -0500, Bill Schmidt wrote:
> > On Sep 12, 2017, at 9:41 AM, Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
> > 
> > Hi
> > 
> > [PATCH, rs6000] Folding of vector loads in GIMPLE
> > 
> > Folding of vector loads in GIMPLE.
> > 
> > - Add code to handle gimple folding for the vec_ld builtins.
> > - Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
> > comments have been adjusted slightly so they continue to read OK for the
> > vec_st code that remains.
> > 
> > The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
> > tests which have been posted separately. (a few minutes ago).
> > 
> > Regtest successfully completed on power6 and newer. (p6,p7,p8le,p8be,p9).
> > 
> > OK for trunk?
> > 
> > Thanks,
> > -Will
> > 
> > [gcc]
> > 
> >        2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
> > 
> > 	* config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
> > 	  for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
> > 	* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
> > 	  Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
> > 
> > diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
> > index 897306c..73e14d9 100644
> > --- a/gcc/config/rs6000/rs6000-c.c
> > +++ b/gcc/config/rs6000/rs6000-c.c
> > @@ -6459,92 +6459,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
> > 		     convert (TREE_TYPE (stmt), arg0));
> >       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
> >       return stmt;
> >     }
> > 
> > -  /* Expand vec_ld into an expression that masks the address and
> > -     performs the load.  We need to expand this early to allow
> > +  /* Expand vec_st into an expression that masks the address and
> > +     performs the store.  We need to expand this early to allow
> >      the best aliasing, as by the time we get into RTL we no longer
> >      are able to honor __restrict__, for example.  We may want to
> >      consider this for all memory access built-ins.
> > 
> >      When -maltivec=be is specified, or the wrong number of arguments
> >      is provided, simply punt to existing built-in processing.  */
> > -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
> > -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> > -      && nargs == 2)
> > -    {
> > -      tree arg0 = (*arglist)[0];
> > -      tree arg1 = (*arglist)[1];
> > -
> > -      /* Strip qualifiers like "const" from the pointer arg.  */
> > -      tree arg1_type = TREE_TYPE (arg1);
> > -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
> > -	goto bad;
> > -
> > -      tree inner_type = TREE_TYPE (arg1_type);
> > -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
> > -	{
> > -	  arg1_type = build_pointer_type (build_qualified_type (inner_type,
> > -								0));
> > -	  arg1 = fold_convert (arg1_type, arg1);
> > -	}
> > -
> > -      /* Construct the masked address.  Let existing error handling take
> > -	 over if we don't have a constant offset.  */
> > -      arg0 = fold (arg0);
> > -
> > -      if (TREE_CODE (arg0) == INTEGER_CST)
> > -	{
> > -	  if (!ptrofftype_p (TREE_TYPE (arg0)))
> > -	    arg0 = build1 (NOP_EXPR, sizetype, arg0);
> > -
> > -	  tree arg1_type = TREE_TYPE (arg1);
> > -	  if (TREE_CODE (arg1_type) == ARRAY_TYPE)
> > -	    {
> > -	      arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
> > -	      tree const0 = build_int_cstu (sizetype, 0);
> > -	      tree arg1_elt0 = build_array_ref (loc, arg1, const0);
> > -	      arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
> > -	    }
> > -
> > -	  tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
> > -				       arg1, arg0);
> > -	  tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
> > -					  build_int_cst (arg1_type, -16));
> > -
> > -	  /* Find the built-in to get the return type so we can convert
> > -	     the result properly (or fall back to default handling if the
> > -	     arguments aren't compatible).  */
> > -	  for (desc = altivec_overloaded_builtins;
> > -	       desc->code && desc->code != fcode; desc++)
> > -	    continue;
> > -
> > -	  for (; desc->code == fcode; desc++)
> > -	    if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
> > -		&& (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
> > -						    desc->op2)))
> > -	      {
> > -		tree ret_type = rs6000_builtin_type (desc->ret_type);
> > -		if (TYPE_MODE (ret_type) == V2DImode)
> > -		  /* Type-based aliasing analysis thinks vector long
> > -		     and vector long long are different and will put them
> > -		     in distinct alias classes.  Force our return type
> > -		     to be a may-alias type to avoid this.  */
> > -		  ret_type
> > -		    = build_pointer_type_for_mode (ret_type, Pmode,
> > -						   true/*can_alias_all*/);
> > -		else
> > -		  ret_type = build_pointer_type (ret_type);
> > -		aligned = build1 (NOP_EXPR, ret_type, aligned);
> > -		tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
> > -		return ret_val;
> > -	      }
> > -	}
> > -    }
> > 
> > -  /* Similarly for stvx.  */
> >   if (fcode == ALTIVEC_BUILTIN_VEC_ST
> >       && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> >       && nargs == 3)
> >     {
> >       tree arg0 = (*arglist)[0];
> > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> > index cf744d8..5b14789 100644
> > --- a/gcc/config/rs6000/rs6000.c
> > +++ b/gcc/config/rs6000/rs6000.c
> > @@ -16473,10 +16473,65 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> > 	res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
> > 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > 	update_call_from_tree (gsi, res);
> > 	return true;
> >       }
> > +    /* Vector loads.  */
> > +    case ALTIVEC_BUILTIN_LVX_V16QI:
> > +    case ALTIVEC_BUILTIN_LVX_V8HI:
> > +    case ALTIVEC_BUILTIN_LVX_V4SI:
> > +    case ALTIVEC_BUILTIN_LVX_V4SF:
> > +    case ALTIVEC_BUILTIN_LVX_V2DI:
> > +    case ALTIVEC_BUILTIN_LVX_V2DF:
> > +      {
> > +	 gimple *g;
> > +	 arg0 = gimple_call_arg (stmt, 0);  // offset
> > +	 arg1 = gimple_call_arg (stmt, 1);  // address
> > +
> > +	 /* Limit folding of loads to LE targets.  */
> > +	 if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
> > +	   return false;
> 
> Why?  This transformation shouldn't be endian-dependent.

I was seeing errors in some of the existing tests specific to BE.
FAIL: gcc.dg/vmx/ld-be-order.c   -Os  execution test
FAIL: gcc.dg/vmx/ld-vsx-be-order.c   -O0  execution test

I'll give this another attempt without that exclusion and verify.  I'll
admit it is possible the ld*be-order tests were failing for other
reasons.

Thanks
-Will

> 
> Thanks,
> Bill
> 
> > +
> > +	 lhs = gimple_call_lhs (stmt);
> > +	 location_t loc = gimple_location (stmt);
> > +
> > +	 tree arg1_type = TREE_TYPE (arg1);
> > +	 tree lhs_type = TREE_TYPE (lhs);
> > +
> > +	 /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
> > +	    the tree using the value from arg0.  The resulting type will match
> > +	    the type of arg1.  */
> > +	 tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
> > +	 g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
> > +	 gimple_set_location (g, loc);
> > +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +	 tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
> > +	 g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
> > +				  temp_offset);
> > +	 gimple_set_location (g, loc);
> > +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +
> > +	 /* Mask off any lower bits from the address.  */
> > +	 tree alignment_mask = build_int_cst (arg1_type, -16);
> > +	 tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
> > +	 g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
> > +				 temp_addr, alignment_mask);
> > +	 gimple_set_location (g, loc);
> > +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +
> > +	 /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
> > +	    take an offset, but since we've already incorporated the offset
> > +	    above, here we just pass in a zero.  */
> > +	 g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
> > +						build_int_cst (arg1_type, 0)));
> > +	 gimple_set_location (g, loc);
> > +	 gsi_replace (gsi, g, true);
> > +
> > +	 return true;
> > +
> > +      }
> > +
> >     default:
> > 	if (TARGET_DEBUG_BUILTIN)
> > 	   fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
> > 		    fn_code, fn_name1, fn_name2);
> >       break;
> > 
> > 
>
will schmidt Sept. 12, 2017, 9:08 p.m. UTC | #3
Hi,

[PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
    
Folding of vector loads in GIMPLE.
    
Add code to handle gimple folding for the vec_ld builtins.
Remove the now-obsolete folding code for vec_ld from rs6000-c.c.  The
surrounding comments have been adjusted slightly so they still read correctly
for the existing vec_st code.
    
The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
tests which have been posted separately.

For V2 of this patch, I've removed the chunk of code that prohibited the
gimple fold from occurring in BE environments.  That check had fixed an issue
for me earlier during development of the code, but it turns out it was not
necessary.  I've sniff-tested after removing the check and it looks OK.

>+ /* Limit folding of loads to LE targets.  */ 
> +	 if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
> +	   return false;

I've restarted a regression test on this updated version.
    
OK for trunk (assuming successful regression test completion)?
    
Thanks,
-Will
    
[gcc]

        2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>

        * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
        for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
        * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
        Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.

diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index fbab0a2..bb8a77d 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
 		     convert (TREE_TYPE (stmt), arg0));
       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
       return stmt;
     }
 
-  /* Expand vec_ld into an expression that masks the address and
-     performs the load.  We need to expand this early to allow
+  /* Expand vec_st into an expression that masks the address and
+     performs the store.  We need to expand this early to allow
      the best aliasing, as by the time we get into RTL we no longer
      are able to honor __restrict__, for example.  We may want to
      consider this for all memory access built-ins.
 
      When -maltivec=be is specified, or the wrong number of arguments
      is provided, simply punt to existing built-in processing.  */
-  if (fcode == ALTIVEC_BUILTIN_VEC_LD
-      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
-      && nargs == 2)
-    {
-      tree arg0 = (*arglist)[0];
-      tree arg1 = (*arglist)[1];
-
-      /* Strip qualifiers like "const" from the pointer arg.  */
-      tree arg1_type = TREE_TYPE (arg1);
-      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
-	goto bad;
-
-      tree inner_type = TREE_TYPE (arg1_type);
-      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
-	{
-	  arg1_type = build_pointer_type (build_qualified_type (inner_type,
-								0));
-	  arg1 = fold_convert (arg1_type, arg1);
-	}
-
-      /* Construct the masked address.  Let existing error handling take
-	 over if we don't have a constant offset.  */
-      arg0 = fold (arg0);
-
-      if (TREE_CODE (arg0) == INTEGER_CST)
-	{
-	  if (!ptrofftype_p (TREE_TYPE (arg0)))
-	    arg0 = build1 (NOP_EXPR, sizetype, arg0);
-
-	  tree arg1_type = TREE_TYPE (arg1);
-	  if (TREE_CODE (arg1_type) == ARRAY_TYPE)
-	    {
-	      arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
-	      tree const0 = build_int_cstu (sizetype, 0);
-	      tree arg1_elt0 = build_array_ref (loc, arg1, const0);
-	      arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
-	    }
-
-	  tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
-				       arg1, arg0);
-	  tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
-					  build_int_cst (arg1_type, -16));
-
-	  /* Find the built-in to get the return type so we can convert
-	     the result properly (or fall back to default handling if the
-	     arguments aren't compatible).  */
-	  for (desc = altivec_overloaded_builtins;
-	       desc->code && desc->code != fcode; desc++)
-	    continue;
-
-	  for (; desc->code == fcode; desc++)
-	    if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
-		&& (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
-						    desc->op2)))
-	      {
-		tree ret_type = rs6000_builtin_type (desc->ret_type);
-		if (TYPE_MODE (ret_type) == V2DImode)
-		  /* Type-based aliasing analysis thinks vector long
-		     and vector long long are different and will put them
-		     in distinct alias classes.  Force our return type
-		     to be a may-alias type to avoid this.  */
-		  ret_type
-		    = build_pointer_type_for_mode (ret_type, Pmode,
-						   true/*can_alias_all*/);
-		else
-		  ret_type = build_pointer_type (ret_type);
-		aligned = build1 (NOP_EXPR, ret_type, aligned);
-		tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
-		return ret_val;
-	      }
-	}
-    }
 
-  /* Similarly for stvx.  */
   if (fcode == ALTIVEC_BUILTIN_VEC_ST
       && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
       && nargs == 3)
     {
       tree arg0 = (*arglist)[0];
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 1338371..1fb5f44 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
 	update_call_from_tree (gsi, res);
 	return true;
       }
+    /* Vector loads.  */
+    case ALTIVEC_BUILTIN_LVX_V16QI:
+    case ALTIVEC_BUILTIN_LVX_V8HI:
+    case ALTIVEC_BUILTIN_LVX_V4SI:
+    case ALTIVEC_BUILTIN_LVX_V4SF:
+    case ALTIVEC_BUILTIN_LVX_V2DI:
+    case ALTIVEC_BUILTIN_LVX_V2DF:
+      {
+	 gimple *g;
+	 arg0 = gimple_call_arg (stmt, 0);  // offset
+	 arg1 = gimple_call_arg (stmt, 1);  // address
+
+	 lhs = gimple_call_lhs (stmt);
+	 location_t loc = gimple_location (stmt);
+
+	 tree arg1_type = TREE_TYPE (arg1);
+	 tree lhs_type = TREE_TYPE (lhs);
+
+	 /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
+	    the tree using the value from arg0.  The resulting type will match
+	    the type of arg1.  */
+	 tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
+	 g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
+	 gimple_set_location (g, loc);
+	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
+	 tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
+	 g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
+				  temp_offset);
+	 gimple_set_location (g, loc);
+	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
+
+	 /* Mask off any lower bits from the address.  */
+	 tree alignment_mask = build_int_cst (arg1_type, -16);
+	 tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
+	 g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
+				 temp_addr, alignment_mask);
+	 gimple_set_location (g, loc);
+	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
+
+	 /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
+	    take an offset, but since we've already incorporated the offset
+	    above, here we just pass in a zero.  */
+	 g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
+						build_int_cst (arg1_type, 0)));
+	 gimple_set_location (g, loc);
+	 gsi_replace (gsi, g, true);
+
+	 return true;
+
+      }
+
     default:
 	if (TARGET_DEBUG_BUILTIN)
 	   fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
 		    fn_code, fn_name1, fn_name2);
       break;
Bill Schmidt Sept. 12, 2017, 9:53 p.m. UTC | #4
> On Sep 12, 2017, at 4:08 PM, Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
> 
> Hi,
> 
> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
> 
> Folding of vector loads in GIMPLE.
> 
> Add code to handle gimple folding for the vec_ld builtins.
> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
> comments have been adjusted slightly so they continue to read OK for the
> existing vec_st code.
> 
> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
> tests which have been posted separately.
> 
> For V2 of this patch, I've removed the chunk of code that prohibited the
> gimple fold from occurring in BE environments.   This had fixed an issue
> for me earlier during my development of the code, and turns out this was
> not necessary.  I've sniff-tested after removing that check and it looks
> OK.

Thanks!
> 
>> + /* Limit folding of loads to LE targets.  */ 
>> +	 if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>> +	   return false;
> 
> I've restarted a regression test on this updated version.
> 
> OK for trunk (assuming successful regression test completion)  ?

Looks good to me otherwise, but Richard may have streamlining
improvements, so please wait for his review.  And of course Segher's.

Thanks,
Bill
> 
> Thanks,
> -Will
> 
> [gcc]
> 
>        2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
> 
>        * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>        for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>        * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>        Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
> 
> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
> index fbab0a2..bb8a77d 100644
> --- a/gcc/config/rs6000/rs6000-c.c
> +++ b/gcc/config/rs6000/rs6000-c.c
> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
> 		     convert (TREE_TYPE (stmt), arg0));
>       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>       return stmt;
>     }
> 
> -  /* Expand vec_ld into an expression that masks the address and
> -     performs the load.  We need to expand this early to allow
> +  /* Expand vec_st into an expression that masks the address and
> +     performs the store.  We need to expand this early to allow
>      the best aliasing, as by the time we get into RTL we no longer
>      are able to honor __restrict__, for example.  We may want to
>      consider this for all memory access built-ins.
> 
>      When -maltivec=be is specified, or the wrong number of arguments
>      is provided, simply punt to existing built-in processing.  */
> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> -      && nargs == 2)
> -    {
> -      tree arg0 = (*arglist)[0];
> -      tree arg1 = (*arglist)[1];
> -
> -      /* Strip qualifiers like "const" from the pointer arg.  */
> -      tree arg1_type = TREE_TYPE (arg1);
> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
> -	goto bad;
> -
> -      tree inner_type = TREE_TYPE (arg1_type);
> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
> -	{
> -	  arg1_type = build_pointer_type (build_qualified_type (inner_type,
> -								0));
> -	  arg1 = fold_convert (arg1_type, arg1);
> -	}
> -
> -      /* Construct the masked address.  Let existing error handling take
> -	 over if we don't have a constant offset.  */
> -      arg0 = fold (arg0);
> -
> -      if (TREE_CODE (arg0) == INTEGER_CST)
> -	{
> -	  if (!ptrofftype_p (TREE_TYPE (arg0)))
> -	    arg0 = build1 (NOP_EXPR, sizetype, arg0);
> -
> -	  tree arg1_type = TREE_TYPE (arg1);
> -	  if (TREE_CODE (arg1_type) == ARRAY_TYPE)
> -	    {
> -	      arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
> -	      tree const0 = build_int_cstu (sizetype, 0);
> -	      tree arg1_elt0 = build_array_ref (loc, arg1, const0);
> -	      arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
> -	    }
> -
> -	  tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
> -				       arg1, arg0);
> -	  tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
> -					  build_int_cst (arg1_type, -16));
> -
> -	  /* Find the built-in to get the return type so we can convert
> -	     the result properly (or fall back to default handling if the
> -	     arguments aren't compatible).  */
> -	  for (desc = altivec_overloaded_builtins;
> -	       desc->code && desc->code != fcode; desc++)
> -	    continue;
> -
> -	  for (; desc->code == fcode; desc++)
> -	    if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
> -		&& (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
> -						    desc->op2)))
> -	      {
> -		tree ret_type = rs6000_builtin_type (desc->ret_type);
> -		if (TYPE_MODE (ret_type) == V2DImode)
> -		  /* Type-based aliasing analysis thinks vector long
> -		     and vector long long are different and will put them
> -		     in distinct alias classes.  Force our return type
> -		     to be a may-alias type to avoid this.  */
> -		  ret_type
> -		    = build_pointer_type_for_mode (ret_type, Pmode,
> -						   true/*can_alias_all*/);
> -		else
> -		  ret_type = build_pointer_type (ret_type);
> -		aligned = build1 (NOP_EXPR, ret_type, aligned);
> -		tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
> -		return ret_val;
> -	      }
> -	}
> -    }
> 
> -  /* Similarly for stvx.  */
>   if (fcode == ALTIVEC_BUILTIN_VEC_ST
>       && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>       && nargs == 3)
>     {
>       tree arg0 = (*arglist)[0];
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 1338371..1fb5f44 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> 	res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
> 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> 	update_call_from_tree (gsi, res);
> 	return true;
>       }
> +    /* Vector loads.  */
> +    case ALTIVEC_BUILTIN_LVX_V16QI:
> +    case ALTIVEC_BUILTIN_LVX_V8HI:
> +    case ALTIVEC_BUILTIN_LVX_V4SI:
> +    case ALTIVEC_BUILTIN_LVX_V4SF:
> +    case ALTIVEC_BUILTIN_LVX_V2DI:
> +    case ALTIVEC_BUILTIN_LVX_V2DF:
> +      {
> +	 gimple *g;
> +	 arg0 = gimple_call_arg (stmt, 0);  // offset
> +	 arg1 = gimple_call_arg (stmt, 1);  // address
> +
> +	 lhs = gimple_call_lhs (stmt);
> +	 location_t loc = gimple_location (stmt);
> +
> +	 tree arg1_type = TREE_TYPE (arg1);
> +	 tree lhs_type = TREE_TYPE (lhs);
> +
> +	 /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
> +	    the tree using the value from arg0.  The resulting type will match
> +	    the type of arg1.  */
> +	 tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
> +	 g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
> +	 gimple_set_location (g, loc);
> +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +	 tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
> +	 g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
> +				  temp_offset);
> +	 gimple_set_location (g, loc);
> +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +
> +	 /* Mask off any lower bits from the address.  */
> +	 tree alignment_mask = build_int_cst (arg1_type, -16);
> +	 tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
> +	 g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
> +				 temp_addr, alignment_mask);
> +	 gimple_set_location (g, loc);
> +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +
> +	 /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
> +	    take an offset, but since we've already incorporated the offset
> +	    above, here we just pass in a zero.  */
> +	 g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
> +						build_int_cst (arg1_type, 0)));
> +	 gimple_set_location (g, loc);
> +	 gsi_replace (gsi, g, true);
> +
> +	 return true;
> +
> +      }
> +
>     default:
> 	if (TARGET_DEBUG_BUILTIN)
> 	   fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
> 		    fn_code, fn_name1, fn_name2);
>       break;
> 
>
Richard Biener Sept. 13, 2017, 12:23 p.m. UTC | #5
On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
<will_schmidt@vnet.ibm.com> wrote:
> Hi,
>
> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
>
> Folding of vector loads in GIMPLE.
>
> Add code to handle gimple folding for the vec_ld builtins.
> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
> comments have been adjusted slightly so they continue to read OK for the
> existing vec_st code.
>
> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
> tests which have been posted separately.
>
> For V2 of this patch, I've removed the chunk of code that prohibited the
> gimple fold from occurring in BE environments.   This had fixed an issue
> for me earlier during my development of the code, and turns out this was
> not necessary.  I've sniff-tested after removing that check and it looks
> OK.
>
>>+ /* Limit folding of loads to LE targets.  */
>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>> +        return false;
>
> I've restarted a regression test on this updated version.
>
> OK for trunk (assuming successful regression test completion)  ?
>
> Thanks,
> -Will
>
> [gcc]
>
>         2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
>
>         * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>         for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>         * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>         Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
>
> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
> index fbab0a2..bb8a77d 100644
> --- a/gcc/config/rs6000/rs6000-c.c
> +++ b/gcc/config/rs6000/rs6000-c.c
> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
>                      convert (TREE_TYPE (stmt), arg0));
>        stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>        return stmt;
>      }
>
> -  /* Expand vec_ld into an expression that masks the address and
> -     performs the load.  We need to expand this early to allow
> +  /* Expand vec_st into an expression that masks the address and
> +     performs the store.  We need to expand this early to allow
>       the best aliasing, as by the time we get into RTL we no longer
>       are able to honor __restrict__, for example.  We may want to
>       consider this for all memory access built-ins.
>
>       When -maltivec=be is specified, or the wrong number of arguments
>       is provided, simply punt to existing built-in processing.  */
> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> -      && nargs == 2)
> -    {
> -      tree arg0 = (*arglist)[0];
> -      tree arg1 = (*arglist)[1];
> -
> -      /* Strip qualifiers like "const" from the pointer arg.  */
> -      tree arg1_type = TREE_TYPE (arg1);
> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
> -       goto bad;
> -
> -      tree inner_type = TREE_TYPE (arg1_type);
> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
> -       {
> -         arg1_type = build_pointer_type (build_qualified_type (inner_type,
> -                                                               0));
> -         arg1 = fold_convert (arg1_type, arg1);
> -       }
> -
> -      /* Construct the masked address.  Let existing error handling take
> -        over if we don't have a constant offset.  */
> -      arg0 = fold (arg0);
> -
> -      if (TREE_CODE (arg0) == INTEGER_CST)
> -       {
> -         if (!ptrofftype_p (TREE_TYPE (arg0)))
> -           arg0 = build1 (NOP_EXPR, sizetype, arg0);
> -
> -         tree arg1_type = TREE_TYPE (arg1);
> -         if (TREE_CODE (arg1_type) == ARRAY_TYPE)
> -           {
> -             arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
> -             tree const0 = build_int_cstu (sizetype, 0);
> -             tree arg1_elt0 = build_array_ref (loc, arg1, const0);
> -             arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
> -           }
> -
> -         tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
> -                                      arg1, arg0);
> -         tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
> -                                         build_int_cst (arg1_type, -16));
> -
> -         /* Find the built-in to get the return type so we can convert
> -            the result properly (or fall back to default handling if the
> -            arguments aren't compatible).  */
> -         for (desc = altivec_overloaded_builtins;
> -              desc->code && desc->code != fcode; desc++)
> -           continue;
> -
> -         for (; desc->code == fcode; desc++)
> -           if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
> -               && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
> -                                                   desc->op2)))
> -             {
> -               tree ret_type = rs6000_builtin_type (desc->ret_type);
> -               if (TYPE_MODE (ret_type) == V2DImode)
> -                 /* Type-based aliasing analysis thinks vector long
> -                    and vector long long are different and will put them
> -                    in distinct alias classes.  Force our return type
> -                    to be a may-alias type to avoid this.  */
> -                 ret_type
> -                   = build_pointer_type_for_mode (ret_type, Pmode,
> -                                                  true/*can_alias_all*/);
> -               else
> -                 ret_type = build_pointer_type (ret_type);
> -               aligned = build1 (NOP_EXPR, ret_type, aligned);
> -               tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
> -               return ret_val;
> -             }
> -       }
> -    }
>
> -  /* Similarly for stvx.  */
>    if (fcode == ALTIVEC_BUILTIN_VEC_ST
>        && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>        && nargs == 3)
>      {
>        tree arg0 = (*arglist)[0];
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 1338371..1fb5f44 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>         res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
>         gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>         update_call_from_tree (gsi, res);
>         return true;
>        }
> +    /* Vector loads.  */
> +    case ALTIVEC_BUILTIN_LVX_V16QI:
> +    case ALTIVEC_BUILTIN_LVX_V8HI:
> +    case ALTIVEC_BUILTIN_LVX_V4SI:
> +    case ALTIVEC_BUILTIN_LVX_V4SF:
> +    case ALTIVEC_BUILTIN_LVX_V2DI:
> +    case ALTIVEC_BUILTIN_LVX_V2DF:
> +      {
> +        gimple *g;
> +        arg0 = gimple_call_arg (stmt, 0);  // offset
> +        arg1 = gimple_call_arg (stmt, 1);  // address
> +
> +        lhs = gimple_call_lhs (stmt);
> +        location_t loc = gimple_location (stmt);
> +
> +        tree arg1_type = TREE_TYPE (arg1);
> +        tree lhs_type = TREE_TYPE (lhs);
> +
> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
> +           the tree using the value from arg0.  The resulting type will match
> +           the type of arg1.  */
> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
> +        gimple_set_location (g, loc);
> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
> +                                 temp_offset);
> +        gimple_set_location (g, loc);
> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +
> +        /* Mask off any lower bits from the address.  */
> +        tree alignment_mask = build_int_cst (arg1_type, -16);
> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
> +                                temp_addr, alignment_mask);
> +        gimple_set_location (g, loc);
> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);

You could use

  gimple_seq stmts = NULL;
  tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
  tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR, arg1_type,
                                 arg1, temp_offset);
  tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR, arg1_type,
                                    temp_addr, build_int_cst (arg1_type, -16));
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);

> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
> +           take an offset, but since we've already incorporated the offset
> +           above, here we just pass in a zero.  */
> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
> +                                               build_int_cst (arg1_type, 0)));

are you sure about arg1_type here?  I'm sure not.  For

... foo (struct S *p)
{
  return __builtin_lvx_v2df (4, (double *)p);
}

you'd end up with p as arg1 and thus struct S * as arg1_type and thus
TBAA using 'struct S' to access the memory.

I think if the builtins have any TBAA constraints you need to build those
explicitly; if not, you should use ptr_type_node, i.e. no TBAA.

Richard.
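
A minimal sketch of that suggestion (assuming the built-ins impose no extra
TBAA constraints) would be to build the MEM_REF with a zero offset of type
ptr_type_node, since the type of that offset operand is what determines the
alias set used for the access:

  /* Sketch only: build the access with no TBAA information.  The type of
     the MEM_REF's second (offset) operand determines the alias set, so use
     ptr_type_node here instead of arg1_type.  */
  tree zero = build_int_cst (ptr_type_node, 0);
  g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr, zero));
  gimple_set_location (g, loc);
  gsi_replace (gsi, g, true);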

> +        gimple_set_location (g, loc);
> +        gsi_replace (gsi, g, true);
> +
> +        return true;
> +
> +      }
> +
>      default:
>         if (TARGET_DEBUG_BUILTIN)
>            fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>                     fn_code, fn_name1, fn_name2);
>        break;
>
>
Bill Schmidt Sept. 13, 2017, 3:40 p.m. UTC | #6
On Sep 13, 2017, at 7:23 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> 
> On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
> <will_schmidt@vnet.ibm.com> wrote:
>> Hi,
>> 
>> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
>> 
>> Folding of vector loads in GIMPLE.
>> 
>> Add code to handle gimple folding for the vec_ld builtins.
>> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
>> comments have been adjusted slightly so they continue to read OK for the
>> existing vec_st code.
>> 
>> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
>> tests which have been posted separately.
>> 
>> For V2 of this patch, I've removed the chunk of code that prohibited the
>> gimple fold from occurring in BE environments.   This had fixed an issue
>> for me earlier during my development of the code, and turns out this was
>> not necessary.  I've sniff-tested after removing that check and it looks
>> OK.
>> 
>>> + /* Limit folding of loads to LE targets.  */
>>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>>> +        return false;
>> 
>> I've restarted a regression test on this updated version.
>> 
>> OK for trunk (assuming successful regression test completion)  ?
>> 
>> Thanks,
>> -Will
>> 
>> [gcc]
>> 
>>        2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
>> 
>>        * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>>        for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>>        * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>>        Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
>> 
>> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
>> index fbab0a2..bb8a77d 100644
>> --- a/gcc/config/rs6000/rs6000-c.c
>> +++ b/gcc/config/rs6000/rs6000-c.c
>> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
>>                     convert (TREE_TYPE (stmt), arg0));
>>       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>>       return stmt;
>>     }
>> 
>> -  /* Expand vec_ld into an expression that masks the address and
>> -     performs the load.  We need to expand this early to allow
>> +  /* Expand vec_st into an expression that masks the address and
>> +     performs the store.  We need to expand this early to allow
>>      the best aliasing, as by the time we get into RTL we no longer
>>      are able to honor __restrict__, for example.  We may want to
>>      consider this for all memory access built-ins.
>> 
>>      When -maltivec=be is specified, or the wrong number of arguments
>>      is provided, simply punt to existing built-in processing.  */
>> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
>> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>> -      && nargs == 2)
>> -    {
>> -      tree arg0 = (*arglist)[0];
>> -      tree arg1 = (*arglist)[1];
>> -
>> -      /* Strip qualifiers like "const" from the pointer arg.  */
>> -      tree arg1_type = TREE_TYPE (arg1);
>> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
>> -       goto bad;
>> -
>> -      tree inner_type = TREE_TYPE (arg1_type);
>> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
>> -       {
>> -         arg1_type = build_pointer_type (build_qualified_type (inner_type,
>> -                                                               0));
>> -         arg1 = fold_convert (arg1_type, arg1);
>> -       }
>> -
>> -      /* Construct the masked address.  Let existing error handling take
>> -        over if we don't have a constant offset.  */
>> -      arg0 = fold (arg0);
>> -
>> -      if (TREE_CODE (arg0) == INTEGER_CST)
>> -       {
>> -         if (!ptrofftype_p (TREE_TYPE (arg0)))
>> -           arg0 = build1 (NOP_EXPR, sizetype, arg0);
>> -
>> -         tree arg1_type = TREE_TYPE (arg1);
>> -         if (TREE_CODE (arg1_type) == ARRAY_TYPE)
>> -           {
>> -             arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
>> -             tree const0 = build_int_cstu (sizetype, 0);
>> -             tree arg1_elt0 = build_array_ref (loc, arg1, const0);
>> -             arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
>> -           }
>> -
>> -         tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
>> -                                      arg1, arg0);
>> -         tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
>> -                                         build_int_cst (arg1_type, -16));
>> -
>> -         /* Find the built-in to get the return type so we can convert
>> -            the result properly (or fall back to default handling if the
>> -            arguments aren't compatible).  */
>> -         for (desc = altivec_overloaded_builtins;
>> -              desc->code && desc->code != fcode; desc++)
>> -           continue;
>> -
>> -         for (; desc->code == fcode; desc++)
>> -           if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
>> -               && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
>> -                                                   desc->op2)))
>> -             {
>> -               tree ret_type = rs6000_builtin_type (desc->ret_type);
>> -               if (TYPE_MODE (ret_type) == V2DImode)
>> -                 /* Type-based aliasing analysis thinks vector long
>> -                    and vector long long are different and will put them
>> -                    in distinct alias classes.  Force our return type
>> -                    to be a may-alias type to avoid this.  */
>> -                 ret_type
>> -                   = build_pointer_type_for_mode (ret_type, Pmode,
>> -                                                  true/*can_alias_all*/);
>> -               else
>> -                 ret_type = build_pointer_type (ret_type);
>> -               aligned = build1 (NOP_EXPR, ret_type, aligned);
>> -               tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
>> -               return ret_val;
>> -             }
>> -       }
>> -    }
>> 
>> -  /* Similarly for stvx.  */
>>   if (fcode == ALTIVEC_BUILTIN_VEC_ST
>>       && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>       && nargs == 3)
>>     {
>>       tree arg0 = (*arglist)[0];
>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>> index 1338371..1fb5f44 100644
>> --- a/gcc/config/rs6000/rs6000.c
>> +++ b/gcc/config/rs6000/rs6000.c
>> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>>        res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
>>        gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>        update_call_from_tree (gsi, res);
>>        return true;
>>       }
>> +    /* Vector loads.  */
>> +    case ALTIVEC_BUILTIN_LVX_V16QI:
>> +    case ALTIVEC_BUILTIN_LVX_V8HI:
>> +    case ALTIVEC_BUILTIN_LVX_V4SI:
>> +    case ALTIVEC_BUILTIN_LVX_V4SF:
>> +    case ALTIVEC_BUILTIN_LVX_V2DI:
>> +    case ALTIVEC_BUILTIN_LVX_V2DF:
>> +      {
>> +        gimple *g;
>> +        arg0 = gimple_call_arg (stmt, 0);  // offset
>> +        arg1 = gimple_call_arg (stmt, 1);  // address
>> +
>> +        lhs = gimple_call_lhs (stmt);
>> +        location_t loc = gimple_location (stmt);
>> +
>> +        tree arg1_type = TREE_TYPE (arg1);
>> +        tree lhs_type = TREE_TYPE (lhs);
>> +
>> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
>> +           the tree using the value from arg0.  The resulting type will match
>> +           the type of arg1.  */
>> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
>> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
>> +        gimple_set_location (g, loc);
>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
>> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
>> +                                 temp_offset);
>> +        gimple_set_location (g, loc);
>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>> +
>> +        /* Mask off any lower bits from the address.  */
>> +        tree alignment_mask = build_int_cst (arg1_type, -16);
>> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
>> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
>> +                                temp_addr, alignment_mask);
>> +        gimple_set_location (g, loc);
>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
> 
> You could use
> 
>  gimple_seq stmts = NULL;
>  tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
>  tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR,
> arg1_type, arg1, temp_offset);
>  tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR,
> arg1_type, temp_addr, build_int_cst (arg1_type, -16));
>  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> 
>> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
>> +           take an offset, but since we've already incorporated the offset
>> +           above, here we just pass in a zero.  */
>> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
>> +                                               build_int_cst (arg1_type, 0)));
> 
> are you sure about arg1_type here?  I'm sure not.  For
> 
> ... foo (struct S *p)
> {
>  return __builtin_lvx_v2df (4, (double *)p);
> }
> 
> you'd end up with p as arg1 and thus struct S * as arg1_type and thus
> TBAA using 'struct S' to access the memory.

Hm, is that so?  Wouldn't arg1_type be double* since arg1 is (double *)p?
Will, you should probably test this example and see, but I'm pretty confident
about this (see below).

> 
> I think if the builtins have any TBAA constraints you need to build those
> explicitely, if not, you should use ptr_type_node aka no TBAA.

The type signatures are constrained during parsing, so we should only
see allowed pointer types on arg1 by the time we get to gimple folding.  I
think that using arg1_type should work, but I am probably missing
something subtle, so please feel free to whack me on the temple until
I get it. :-)

Bill
> 
> Richard.
> 
>> +        gimple_set_location (g, loc);
>> +        gsi_replace (gsi, g, true);
>> +
>> +        return true;
>> +
>> +      }
>> +
>>     default:
>>        if (TARGET_DEBUG_BUILTIN)
>>           fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>>                    fn_code, fn_name1, fn_name2);
>>       break;
Bill Schmidt Sept. 13, 2017, 8:14 p.m. UTC | #7
On Sep 13, 2017, at 10:40 AM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote:
> 
> On Sep 13, 2017, at 7:23 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>> 
>> On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
>> <will_schmidt@vnet.ibm.com> wrote:
>>> Hi,
>>> 
>>> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
>>> 
>>> Folding of vector loads in GIMPLE.
>>> 
>>> Add code to handle gimple folding for the vec_ld builtins.
>>> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
>>> comments have been adjusted slightly so they continue to read OK for the
>>> existing vec_st code.
>>> 
>>> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
>>> tests which have been posted separately.
>>> 
>>> For V2 of this patch, I've removed the chunk of code that prohibited the
>>> gimple fold from occurring in BE environments.   This had fixed an issue
>>> for me earlier during my development of the code, and turns out this was
>>> not necessary.  I've sniff-tested after removing that check and it looks
>>> OK.
>>> 
>>>> + /* Limit folding of loads to LE targets.  */
>>>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>>>> +        return false;
>>> 
>>> I've restarted a regression test on this updated version.
>>> 
>>> OK for trunk (assuming successful regression test completion)  ?
>>> 
>>> Thanks,
>>> -Will
>>> 
>>> [gcc]
>>> 
>>>       2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
>>> 
>>>       * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>>>       for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>>>       * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>>>       Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
>>> 
>>> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
>>> index fbab0a2..bb8a77d 100644
>>> --- a/gcc/config/rs6000/rs6000-c.c
>>> +++ b/gcc/config/rs6000/rs6000-c.c
>>> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
>>>                    convert (TREE_TYPE (stmt), arg0));
>>>      stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>>>      return stmt;
>>>    }
>>> 
>>> -  /* Expand vec_ld into an expression that masks the address and
>>> -     performs the load.  We need to expand this early to allow
>>> +  /* Expand vec_st into an expression that masks the address and
>>> +     performs the store.  We need to expand this early to allow
>>>     the best aliasing, as by the time we get into RTL we no longer
>>>     are able to honor __restrict__, for example.  We may want to
>>>     consider this for all memory access built-ins.
>>> 
>>>     When -maltivec=be is specified, or the wrong number of arguments
>>>     is provided, simply punt to existing built-in processing.  */
>>> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
>>> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>> -      && nargs == 2)
>>> -    {
>>> -      tree arg0 = (*arglist)[0];
>>> -      tree arg1 = (*arglist)[1];
>>> -
>>> -      /* Strip qualifiers like "const" from the pointer arg.  */
>>> -      tree arg1_type = TREE_TYPE (arg1);
>>> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
>>> -       goto bad;
>>> -
>>> -      tree inner_type = TREE_TYPE (arg1_type);
>>> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
>>> -       {
>>> -         arg1_type = build_pointer_type (build_qualified_type (inner_type,
>>> -                                                               0));
>>> -         arg1 = fold_convert (arg1_type, arg1);
>>> -       }
>>> -
>>> -      /* Construct the masked address.  Let existing error handling take
>>> -        over if we don't have a constant offset.  */
>>> -      arg0 = fold (arg0);
>>> -
>>> -      if (TREE_CODE (arg0) == INTEGER_CST)
>>> -       {
>>> -         if (!ptrofftype_p (TREE_TYPE (arg0)))
>>> -           arg0 = build1 (NOP_EXPR, sizetype, arg0);
>>> -
>>> -         tree arg1_type = TREE_TYPE (arg1);
>>> -         if (TREE_CODE (arg1_type) == ARRAY_TYPE)
>>> -           {
>>> -             arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
>>> -             tree const0 = build_int_cstu (sizetype, 0);
>>> -             tree arg1_elt0 = build_array_ref (loc, arg1, const0);
>>> -             arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
>>> -           }
>>> -
>>> -         tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
>>> -                                      arg1, arg0);
>>> -         tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
>>> -                                         build_int_cst (arg1_type, -16));
>>> -
>>> -         /* Find the built-in to get the return type so we can convert
>>> -            the result properly (or fall back to default handling if the
>>> -            arguments aren't compatible).  */
>>> -         for (desc = altivec_overloaded_builtins;
>>> -              desc->code && desc->code != fcode; desc++)
>>> -           continue;
>>> -
>>> -         for (; desc->code == fcode; desc++)
>>> -           if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
>>> -               && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
>>> -                                                   desc->op2)))
>>> -             {
>>> -               tree ret_type = rs6000_builtin_type (desc->ret_type);
>>> -               if (TYPE_MODE (ret_type) == V2DImode)
>>> -                 /* Type-based aliasing analysis thinks vector long
>>> -                    and vector long long are different and will put them
>>> -                    in distinct alias classes.  Force our return type
>>> -                    to be a may-alias type to avoid this.  */
>>> -                 ret_type
>>> -                   = build_pointer_type_for_mode (ret_type, Pmode,
>>> -                                                  true/*can_alias_all*/);
>>> -               else
>>> -                 ret_type = build_pointer_type (ret_type);
>>> -               aligned = build1 (NOP_EXPR, ret_type, aligned);
>>> -               tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
>>> -               return ret_val;
>>> -             }
>>> -       }
>>> -    }
>>> 
>>> -  /* Similarly for stvx.  */
>>>  if (fcode == ALTIVEC_BUILTIN_VEC_ST
>>>      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>      && nargs == 3)
>>>    {
>>>      tree arg0 = (*arglist)[0];
>>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>>> index 1338371..1fb5f44 100644
>>> --- a/gcc/config/rs6000/rs6000.c
>>> +++ b/gcc/config/rs6000/rs6000.c
>>> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>>>       res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
>>>       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>       update_call_from_tree (gsi, res);
>>>       return true;
>>>      }
>>> +    /* Vector loads.  */
>>> +    case ALTIVEC_BUILTIN_LVX_V16QI:
>>> +    case ALTIVEC_BUILTIN_LVX_V8HI:
>>> +    case ALTIVEC_BUILTIN_LVX_V4SI:
>>> +    case ALTIVEC_BUILTIN_LVX_V4SF:
>>> +    case ALTIVEC_BUILTIN_LVX_V2DI:
>>> +    case ALTIVEC_BUILTIN_LVX_V2DF:
>>> +      {
>>> +        gimple *g;
>>> +        arg0 = gimple_call_arg (stmt, 0);  // offset
>>> +        arg1 = gimple_call_arg (stmt, 1);  // address
>>> +
>>> +        lhs = gimple_call_lhs (stmt);
>>> +        location_t loc = gimple_location (stmt);
>>> +
>>> +        tree arg1_type = TREE_TYPE (arg1);
>>> +        tree lhs_type = TREE_TYPE (lhs);
>>> +
>>> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
>>> +           the tree using the value from arg0.  The resulting type will match
>>> +           the type of arg1.  */
>>> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
>>> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
>>> +        gimple_set_location (g, loc);
>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
>>> +                                 temp_offset);
>>> +        gimple_set_location (g, loc);
>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>> +
>>> +        /* Mask off any lower bits from the address.  */
>>> +        tree alignment_mask = build_int_cst (arg1_type, -16);
>>> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
>>> +                                temp_addr, alignment_mask);
>>> +        gimple_set_location (g, loc);
>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>> 
>> You could use
>> 
>> gimple_seq stmts = NULL;
>> tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
>> tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR,
>> arg1_type, arg1, temp_offset);
>> tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR,
>> arg1_type, temp_addr, build_int_cst (arg1_type, -16));
>> gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>> 
>>> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
>>> +           take an offset, but since we've already incorporated the offset
>>> +           above, here we just pass in a zero.  */
>>> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
>>> +                                               build_int_cst (arg1_type, 0)));
>> 
>> are you sure about arg1_type here?  I'm sure not.  For
>> 
>> ... foo (struct S *p)
>> {
>> return __builtin_lvx_v2df (4, (double *)p);
>> }
>> 
>> you'd end up with p as arg1 and thus struct S * as arg1_type and thus
>> TBAA using 'struct S' to access the memory.
> 
> Hm, is that so?  Wouldn't arg1_type be double* since arg1 is (double *)p?
> Will, you should probably test this example and see, but I'm pretty confident
> about this (see below).

But, as I should have suspected, you're right.  For some reason 
gimple_call_arg is returning p, stripped of the cast by which the
user asserted that p points to a double.

Can you explain to me why this should be so?  I assume that somebody
has decided to strip_nops the argument and lose the cast.

Using ptr_type_node loses all type information, so that would be a
regression from what we do today.  In some cases we could reconstruct
that this was necessarily, say, a double*, but I don't know how we would
recover the signedness for an integer type.
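
One direction we could look at (just a sketch of an idea, not something
I've tested): since each LVX builtin is already type-specific, the vector
type being loaded is known from the call itself, so the MEM_REF could be
typed from lhs_type rather than from the cast-stripped pointer argument:

  /* Sketch only: type the access from the builtin's result type instead
     of arg1_type, so TBAA sees e.g. vector double rather than struct S.
     V2DI would presumably still need the may-alias treatment that the
     old rs6000-c.c code applied.  */
  tree ref_ptr_type = build_pointer_type (lhs_type);
  g = gimple_build_assign (lhs,
                           build2 (MEM_REF, lhs_type, aligned_addr,
                                   build_int_cst (ref_ptr_type, 0)));

Whether that is any better than just using ptr_type_node is exactly the
question, of course.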

Bill
> 
>> 
>> I think if the builtins have any TBAA constraints you need to build those
>> explicitely, if not, you should use ptr_type_node aka no TBAA.
> 
> The type signatures are constrained during parsing, so we should only
> see allowed pointer types on arg1 by the time we get to gimple folding.  I
> think that using arg1_type should work, but I am probably missing
> something subtle, so please feel free to whack me on the temple until
> I get it. :-)
> 
> Bill
>> 
>> Richard.
>> 
>>> +        gimple_set_location (g, loc);
>>> +        gsi_replace (gsi, g, true);
>>> +
>>> +        return true;
>>> +
>>> +      }
>>> +
>>>    default:
>>>       if (TARGET_DEBUG_BUILTIN)
>>>          fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>>>                   fn_code, fn_name1, fn_name2);
>>>      break;
Richard Biener Sept. 14, 2017, 10:15 a.m. UTC | #8
On Wed, Sep 13, 2017 at 10:14 PM, Bill Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> On Sep 13, 2017, at 10:40 AM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote:
>>
>> On Sep 13, 2017, at 7:23 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>>
>>> On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
>>> <will_schmidt@vnet.ibm.com> wrote:
>>>> Hi,
>>>>
>>>> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
>>>>
>>>> Folding of vector loads in GIMPLE.
>>>>
>>>> Add code to handle gimple folding for the vec_ld builtins.
>>>> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
>>>> comments have been adjusted slightly so they continue to read OK for the
>>>> existing vec_st code.
>>>>
>>>> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
>>>> tests which have been posted separately.
>>>>
>>>> For V2 of this patch, I've removed the chunk of code that prohibited the
>>>> gimple fold from occurring in BE environments.   This had fixed an issue
>>>> for me earlier during my development of the code, and turns out this was
>>>> not necessary.  I've sniff-tested after removing that check and it looks
>>>> OK.
>>>>
>>>>> + /* Limit folding of loads to LE targets.  */
>>>>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>>>>> +        return false;
>>>>
>>>> I've restarted a regression test on this updated version.
>>>>
>>>> OK for trunk (assuming successful regression test completion)  ?
>>>>
>>>> Thanks,
>>>> -Will
>>>>
>>>> [gcc]
>>>>
>>>>       2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
>>>>
>>>>       * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>>>>       for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>>>>       * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>>>>       Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
>>>>
>>>> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
>>>> index fbab0a2..bb8a77d 100644
>>>> --- a/gcc/config/rs6000/rs6000-c.c
>>>> +++ b/gcc/config/rs6000/rs6000-c.c
>>>> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
>>>>                    convert (TREE_TYPE (stmt), arg0));
>>>>      stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>>>>      return stmt;
>>>>    }
>>>>
>>>> -  /* Expand vec_ld into an expression that masks the address and
>>>> -     performs the load.  We need to expand this early to allow
>>>> +  /* Expand vec_st into an expression that masks the address and
>>>> +     performs the store.  We need to expand this early to allow
>>>>     the best aliasing, as by the time we get into RTL we no longer
>>>>     are able to honor __restrict__, for example.  We may want to
>>>>     consider this for all memory access built-ins.
>>>>
>>>>     When -maltivec=be is specified, or the wrong number of arguments
>>>>     is provided, simply punt to existing built-in processing.  */
>>>> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
>>>> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>> -      && nargs == 2)
>>>> -    {
>>>> -      tree arg0 = (*arglist)[0];
>>>> -      tree arg1 = (*arglist)[1];
>>>> -
>>>> -      /* Strip qualifiers like "const" from the pointer arg.  */
>>>> -      tree arg1_type = TREE_TYPE (arg1);
>>>> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
>>>> -       goto bad;
>>>> -
>>>> -      tree inner_type = TREE_TYPE (arg1_type);
>>>> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
>>>> -       {
>>>> -         arg1_type = build_pointer_type (build_qualified_type (inner_type,
>>>> -                                                               0));
>>>> -         arg1 = fold_convert (arg1_type, arg1);
>>>> -       }
>>>> -
>>>> -      /* Construct the masked address.  Let existing error handling take
>>>> -        over if we don't have a constant offset.  */
>>>> -      arg0 = fold (arg0);
>>>> -
>>>> -      if (TREE_CODE (arg0) == INTEGER_CST)
>>>> -       {
>>>> -         if (!ptrofftype_p (TREE_TYPE (arg0)))
>>>> -           arg0 = build1 (NOP_EXPR, sizetype, arg0);
>>>> -
>>>> -         tree arg1_type = TREE_TYPE (arg1);
>>>> -         if (TREE_CODE (arg1_type) == ARRAY_TYPE)
>>>> -           {
>>>> -             arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
>>>> -             tree const0 = build_int_cstu (sizetype, 0);
>>>> -             tree arg1_elt0 = build_array_ref (loc, arg1, const0);
>>>> -             arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
>>>> -           }
>>>> -
>>>> -         tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
>>>> -                                      arg1, arg0);
>>>> -         tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
>>>> -                                         build_int_cst (arg1_type, -16));
>>>> -
>>>> -         /* Find the built-in to get the return type so we can convert
>>>> -            the result properly (or fall back to default handling if the
>>>> -            arguments aren't compatible).  */
>>>> -         for (desc = altivec_overloaded_builtins;
>>>> -              desc->code && desc->code != fcode; desc++)
>>>> -           continue;
>>>> -
>>>> -         for (; desc->code == fcode; desc++)
>>>> -           if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
>>>> -               && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
>>>> -                                                   desc->op2)))
>>>> -             {
>>>> -               tree ret_type = rs6000_builtin_type (desc->ret_type);
>>>> -               if (TYPE_MODE (ret_type) == V2DImode)
>>>> -                 /* Type-based aliasing analysis thinks vector long
>>>> -                    and vector long long are different and will put them
>>>> -                    in distinct alias classes.  Force our return type
>>>> -                    to be a may-alias type to avoid this.  */
>>>> -                 ret_type
>>>> -                   = build_pointer_type_for_mode (ret_type, Pmode,
>>>> -                                                  true/*can_alias_all*/);
>>>> -               else
>>>> -                 ret_type = build_pointer_type (ret_type);
>>>> -               aligned = build1 (NOP_EXPR, ret_type, aligned);
>>>> -               tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
>>>> -               return ret_val;
>>>> -             }
>>>> -       }
>>>> -    }
>>>>
>>>> -  /* Similarly for stvx.  */
>>>>  if (fcode == ALTIVEC_BUILTIN_VEC_ST
>>>>      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>>      && nargs == 3)
>>>>    {
>>>>      tree arg0 = (*arglist)[0];
>>>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>>>> index 1338371..1fb5f44 100644
>>>> --- a/gcc/config/rs6000/rs6000.c
>>>> +++ b/gcc/config/rs6000/rs6000.c
>>>> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>>>>       res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
>>>>       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>>       update_call_from_tree (gsi, res);
>>>>       return true;
>>>>      }
>>>> +    /* Vector loads.  */
>>>> +    case ALTIVEC_BUILTIN_LVX_V16QI:
>>>> +    case ALTIVEC_BUILTIN_LVX_V8HI:
>>>> +    case ALTIVEC_BUILTIN_LVX_V4SI:
>>>> +    case ALTIVEC_BUILTIN_LVX_V4SF:
>>>> +    case ALTIVEC_BUILTIN_LVX_V2DI:
>>>> +    case ALTIVEC_BUILTIN_LVX_V2DF:
>>>> +      {
>>>> +        gimple *g;
>>>> +        arg0 = gimple_call_arg (stmt, 0);  // offset
>>>> +        arg1 = gimple_call_arg (stmt, 1);  // address
>>>> +
>>>> +        lhs = gimple_call_lhs (stmt);
>>>> +        location_t loc = gimple_location (stmt);
>>>> +
>>>> +        tree arg1_type = TREE_TYPE (arg1);
>>>> +        tree lhs_type = TREE_TYPE (lhs);
>>>> +
>>>> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
>>>> +           the tree using the value from arg0.  The resulting type will match
>>>> +           the type of arg1.  */
>>>> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
>>>> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
>>>> +        gimple_set_location (g, loc);
>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>>> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
>>>> +                                 temp_offset);
>>>> +        gimple_set_location (g, loc);
>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>> +
>>>> +        /* Mask off any lower bits from the address.  */
>>>> +        tree alignment_mask = build_int_cst (arg1_type, -16);
>>>> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>>> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
>>>> +                                temp_addr, alignment_mask);
>>>> +        gimple_set_location (g, loc);
>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>
>>> You could use
>>>
>>> gimple_seq stmts = NULL;
>>> tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
>>> tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR,
>>> arg1_type, arg1, temp_offset);
>>> tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR,
>>> arg1_type, temp_addr, build_int_cst (arg1_type, -16));
>>> gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>
>>>> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
>>>> +           take an offset, but since we've already incorporated the offset
>>>> +           above, here we just pass in a zero.  */
>>>> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
>>>> +                                               build_int_cst (arg1_type, 0)));
>>>
>>> are you sure about arg1_type here?  I'm sure not.  For
>>>
>>> ... foo (struct S *p)
>>> {
>>> return __builtin_lvx_v2df (4, (double *)p);
>>> }
>>>
>>> you'd end up with p as arg1 and thus struct S * as arg1_type and thus
>>> TBAA using 'struct S' to access the memory.
>>
>> Hm, is that so?  Wouldn't arg1_type be double* since arg1 is (double *)p?
>> Will, you should probably test this example and see, but I'm pretty confident
>> about this (see below).
>
> But, as I should have suspected, you're right.  For some reason
> gimple_call_arg is returning p, stripped of the cast information where the
> user asserted that p points to a double*.
>
> Can you explain to me why this should be so?  I assume that somebody
> has decided to strip_nops the argument and lose the cast.

pointer types have no meaning in GIMPLE so we aggressively prune them.
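
To make that concrete (a made-up example, not from the patch; it assumes
-maltivec/-mvsx so that __builtin_altivec_lvx_v2df and __vector double
are available):

  struct S { double d[2]; };

  __vector double
  foo (struct S *p)
  {
    /* The cast to double * is a useless pointer conversion in GIMPLE,
       so it is stripped during gimplification; by the time the folder
       runs, gimple_call_arg (stmt, 1) is just p with type struct S *.  */
    return __builtin_altivec_lvx_v2df (4, (double *) p);
  }

A -fdump-tree-gimple dump of this shows something like
__builtin_altivec_lvx_v2df (4, p) -- the cast is already gone.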

> Using ptr_type_node loses all type information, so that would be a
> regression from what we do today.  In some cases we could reconstruct
> that this was necessarily, say, a double*, but I don't know how we would
> recover the signedness for an integer type.

How did we handle the expansion previously - ah - it was done earlier
in the C FE.  So why are you moving it to GIMPLE?  The function is called
resolve_overloaded_builtin - what kind of overloading do you resolve here?
As said, argument types might not be preserved.

Richard.

> Bill
>>
>>>
>>> I think if the builtins have any TBAA constraints you need to build those
>>> explicitely, if not, you should use ptr_type_node aka no TBAA.
>>
>> The type signatures are constrained during parsing, so we should only
>> see allowed pointer types on arg1 by the time we get to gimple folding.  I
>> think that using arg1_type should work, but I am probably missing
>> something subtle, so please feel free to whack me on the temple until
>> I get it. :-)
>>
>> Bill
>>>
>>> Richard.
>>>
>>>> +        gimple_set_location (g, loc);
>>>> +        gsi_replace (gsi, g, true);
>>>> +
>>>> +        return true;
>>>> +
>>>> +      }
>>>> +
>>>>    default:
>>>>       if (TARGET_DEBUG_BUILTIN)
>>>>          fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>>>>                   fn_code, fn_name1, fn_name2);
>>>>      break;
>
Bill Schmidt Sept. 14, 2017, 2:38 p.m. UTC | #9
On Sep 14, 2017, at 5:15 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> 
> On Wed, Sep 13, 2017 at 10:14 PM, Bill Schmidt
> <wschmidt@linux.vnet.ibm.com> wrote:
>> On Sep 13, 2017, at 10:40 AM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote:
>>> 
>>> On Sep 13, 2017, at 7:23 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>>> 
>>>> On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
>>>> <will_schmidt@vnet.ibm.com> wrote:
>>>>> Hi,
>>>>> 
>>>>> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
>>>>> 
>>>>> Folding of vector loads in GIMPLE.
>>>>> 
>>>>> Add code to handle gimple folding for the vec_ld builtins.
>>>>> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
>>>>> comments have been adjusted slightly so they continue to read OK for the
>>>>> existing vec_st code.
>>>>> 
>>>>> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
>>>>> tests which have been posted separately.
>>>>> 
>>>>> For V2 of this patch, I've removed the chunk of code that prohibited the
>>>>> gimple fold from occurring in BE environments.   This had fixed an issue
>>>>> for me earlier during my development of the code, and turns out this was
>>>>> not necessary.  I've sniff-tested after removing that check and it looks
>>>>> OK.
>>>>> 
>>>>>> + /* Limit folding of loads to LE targets.  */
>>>>>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>>>>>> +        return false;
>>>>> 
>>>>> I've restarted a regression test on this updated version.
>>>>> 
>>>>> OK for trunk (assuming successful regression test completion)  ?
>>>>> 
>>>>> Thanks,
>>>>> -Will
>>>>> 
>>>>> [gcc]
>>>>> 
>>>>>      2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
>>>>> 
>>>>>      * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>>>>>      for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>>>>>      * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>>>>>      Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
>>>>> 
>>>>> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
>>>>> index fbab0a2..bb8a77d 100644
>>>>> --- a/gcc/config/rs6000/rs6000-c.c
>>>>> +++ b/gcc/config/rs6000/rs6000-c.c
>>>>> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
>>>>>                   convert (TREE_TYPE (stmt), arg0));
>>>>>     stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>>>>>     return stmt;
>>>>>   }
>>>>> 
>>>>> -  /* Expand vec_ld into an expression that masks the address and
>>>>> -     performs the load.  We need to expand this early to allow
>>>>> +  /* Expand vec_st into an expression that masks the address and
>>>>> +     performs the store.  We need to expand this early to allow
>>>>>    the best aliasing, as by the time we get into RTL we no longer
>>>>>    are able to honor __restrict__, for example.  We may want to
>>>>>    consider this for all memory access built-ins.
>>>>> 
>>>>>    When -maltivec=be is specified, or the wrong number of arguments
>>>>>    is provided, simply punt to existing built-in processing.  */
>>>>> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
>>>>> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>>> -      && nargs == 2)
>>>>> -    {
>>>>> -      tree arg0 = (*arglist)[0];
>>>>> -      tree arg1 = (*arglist)[1];
>>>>> -
>>>>> -      /* Strip qualifiers like "const" from the pointer arg.  */
>>>>> -      tree arg1_type = TREE_TYPE (arg1);
>>>>> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
>>>>> -       goto bad;
>>>>> -
>>>>> -      tree inner_type = TREE_TYPE (arg1_type);
>>>>> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
>>>>> -       {
>>>>> -         arg1_type = build_pointer_type (build_qualified_type (inner_type,
>>>>> -                                                               0));
>>>>> -         arg1 = fold_convert (arg1_type, arg1);
>>>>> -       }
>>>>> -
>>>>> -      /* Construct the masked address.  Let existing error handling take
>>>>> -        over if we don't have a constant offset.  */
>>>>> -      arg0 = fold (arg0);
>>>>> -
>>>>> -      if (TREE_CODE (arg0) == INTEGER_CST)
>>>>> -       {
>>>>> -         if (!ptrofftype_p (TREE_TYPE (arg0)))
>>>>> -           arg0 = build1 (NOP_EXPR, sizetype, arg0);
>>>>> -
>>>>> -         tree arg1_type = TREE_TYPE (arg1);
>>>>> -         if (TREE_CODE (arg1_type) == ARRAY_TYPE)
>>>>> -           {
>>>>> -             arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
>>>>> -             tree const0 = build_int_cstu (sizetype, 0);
>>>>> -             tree arg1_elt0 = build_array_ref (loc, arg1, const0);
>>>>> -             arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
>>>>> -           }
>>>>> -
>>>>> -         tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
>>>>> -                                      arg1, arg0);
>>>>> -         tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
>>>>> -                                         build_int_cst (arg1_type, -16));
>>>>> -
>>>>> -         /* Find the built-in to get the return type so we can convert
>>>>> -            the result properly (or fall back to default handling if the
>>>>> -            arguments aren't compatible).  */
>>>>> -         for (desc = altivec_overloaded_builtins;
>>>>> -              desc->code && desc->code != fcode; desc++)
>>>>> -           continue;
>>>>> -
>>>>> -         for (; desc->code == fcode; desc++)
>>>>> -           if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
>>>>> -               && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
>>>>> -                                                   desc->op2)))
>>>>> -             {
>>>>> -               tree ret_type = rs6000_builtin_type (desc->ret_type);
>>>>> -               if (TYPE_MODE (ret_type) == V2DImode)
>>>>> -                 /* Type-based aliasing analysis thinks vector long
>>>>> -                    and vector long long are different and will put them
>>>>> -                    in distinct alias classes.  Force our return type
>>>>> -                    to be a may-alias type to avoid this.  */
>>>>> -                 ret_type
>>>>> -                   = build_pointer_type_for_mode (ret_type, Pmode,
>>>>> -                                                  true/*can_alias_all*/);
>>>>> -               else
>>>>> -                 ret_type = build_pointer_type (ret_type);
>>>>> -               aligned = build1 (NOP_EXPR, ret_type, aligned);
>>>>> -               tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
>>>>> -               return ret_val;
>>>>> -             }
>>>>> -       }
>>>>> -    }
>>>>> 
>>>>> -  /* Similarly for stvx.  */
>>>>> if (fcode == ALTIVEC_BUILTIN_VEC_ST
>>>>>     && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>>>     && nargs == 3)
>>>>>   {
>>>>>     tree arg0 = (*arglist)[0];
>>>>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>>>>> index 1338371..1fb5f44 100644
>>>>> --- a/gcc/config/rs6000/rs6000.c
>>>>> +++ b/gcc/config/rs6000/rs6000.c
>>>>> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>>>>>      res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
>>>>>      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>>>      update_call_from_tree (gsi, res);
>>>>>      return true;
>>>>>     }
>>>>> +    /* Vector loads.  */
>>>>> +    case ALTIVEC_BUILTIN_LVX_V16QI:
>>>>> +    case ALTIVEC_BUILTIN_LVX_V8HI:
>>>>> +    case ALTIVEC_BUILTIN_LVX_V4SI:
>>>>> +    case ALTIVEC_BUILTIN_LVX_V4SF:
>>>>> +    case ALTIVEC_BUILTIN_LVX_V2DI:
>>>>> +    case ALTIVEC_BUILTIN_LVX_V2DF:
>>>>> +      {
>>>>> +        gimple *g;
>>>>> +        arg0 = gimple_call_arg (stmt, 0);  // offset
>>>>> +        arg1 = gimple_call_arg (stmt, 1);  // address
>>>>> +
>>>>> +        lhs = gimple_call_lhs (stmt);
>>>>> +        location_t loc = gimple_location (stmt);
>>>>> +
>>>>> +        tree arg1_type = TREE_TYPE (arg1);
>>>>> +        tree lhs_type = TREE_TYPE (lhs);
>>>>> +
>>>>> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
>>>>> +           the tree using the value from arg0.  The resulting type will match
>>>>> +           the type of arg1.  */
>>>>> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
>>>>> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
>>>>> +        gimple_set_location (g, loc);
>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>>> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>>>> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
>>>>> +                                 temp_offset);
>>>>> +        gimple_set_location (g, loc);
>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>>> +
>>>>> +        /* Mask off any lower bits from the address.  */
>>>>> +        tree alignment_mask = build_int_cst (arg1_type, -16);
>>>>> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>>>> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
>>>>> +                                temp_addr, alignment_mask);
>>>>> +        gimple_set_location (g, loc);
>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>> 
>>>> You could use
>>>> 
>>>> gimple_seq stmts = NULL;
>>>> tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
>>>> tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR,
>>>> arg1_type, arg1, temp_offset);
>>>> tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR,
>>>> arg1_type, temp_addr, build_int_cst (arg1_type, -16));
>>>> gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>> 
>>>>> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
>>>>> +           take an offset, but since we've already incorporated the offset
>>>>> +           above, here we just pass in a zero.  */
>>>>> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
>>>>> +                                               build_int_cst (arg1_type, 0)));
>>>> 
>>>> are you sure about arg1_type here?  I'm sure not.  For
>>>> 
>>>> ... foo (struct S *p)
>>>> {
>>>> return __builtin_lvx_v2df (4, (double *)p);
>>>> }
>>>> 
>>>> you'd end up with p as arg1 and thus struct S * as arg1_type and thus
>>>> TBAA using 'struct S' to access the memory.
>>> 
>>> Hm, is that so?  Wouldn't arg1_type be double* since arg1 is (double *)p?
>>> Will, you should probably test this example and see, but I'm pretty confident
>>> about this (see below).
>> 
>> But, as I should have suspected, you're right.  For some reason
>> gimple_call_arg is returning p, stripped of the cast information where the
>> user asserted that p points to a double*.
>> 
>> Can you explain to me why this should be so?  I assume that somebody
>> has decided to strip_nops the argument and lose the cast.
> 
> pointer types have no meaning in GIMPLE so we aggressively prune them.
> 
>> Using ptr_type_node loses all type information, so that would be a
>> regression from what we do today.  In some cases we could reconstruct
>> that this was necessarily, say, a double*, but I don't know how we would
>> recover the signedness for an integer type.
> 
> How did we handle the expansion previously - ah - it was done earlier
> in the C FE.  So why are you moving it to GIMPLE?  The function is called
> resolve_overloaded_builtin - what kind of overloading do you resolve here?
> As said argument types might not be preserved.

The AltiVec builtins allow overloaded names based on the argument types,
using a special callout during parsing to convert the overloaded names to
type-specific names.  Historically these have then remained builtin calls
until RTL expansion, which loses a lot of useful optimization opportunities.  Will has been
gradually implementing gimple folding for these builtins so that we can
optimize simple vector arithmetic and so on.  The overloading is still dealt
with during parsing.

As an example:

  double a[64];
  vector double x = vec_ld (0, a);

will get translated into

  vector double x = __builtin_altivec_lvx_v2df (0, a);

and 

  unsigned char b[64];
  vector unsigned char y = vec_ld (0, b);

will get translated into

  vector unsigned char y = __builtin_altivec_lvx_v16qi (0, b);

So in resolving the overloading we still maintain the type info for arg1.

Earlier I had dealt with the performance issue in a different way for the 
vec_ld and vec_st overloaded builtins, which created the rather grotty 
code in rs6000-c.c to modify the parse trees instead.  My hope was that
we could simplify the code by having Will deal with them as gimple folds
instead.  But if in so doing we lose type information, that may not be the
right call.

However, since you say that gimple aggressively removes the casts 
from pointer types, perhaps the code that we see in early gimple from
the existing method might also be missing the type information?  Will,
it would be worth looking at that code to see.  If it's no different then
perhaps we still go ahead with the folding.
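
If it helps, the check I have in mind is something along these lines -- a
made-up test case, compiled before and after the patch with
-maltivec -mvsx -fdump-tree-gimple, comparing the type that shows up on
the memory access in the two .gimple dumps:

  #include <altivec.h>

  struct S { double d[2]; };

  __vector double
  ld (long ll1, struct S *p)
  {
    /* The cast asserts a vector double access; the question is whether
       that survives to the gimple dump under either scheme.  */
    return vec_ld (ll1, (__vector double *) p);
  }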

Another note for Will:  The existing code gives up when -maltivec=be has
been specified, and you probably want to do that as well.  That may be
why you initially turned off big endian -- it is easy to misread that code.
-maltivec=be is VECTOR_ELT_ORDER_BIG && !BYTES_BIG_ENDIAN.

Thanks,
Bill
> 
> Richard.
> 
>> Bill
>>> 
>>>> 
>>>> I think if the builtins have any TBAA constraints you need to build those
>>>> explicitely, if not, you should use ptr_type_node aka no TBAA.
>>> 
>>> The type signatures are constrained during parsing, so we should only
>>> see allowed pointer types on arg1 by the time we get to gimple folding.  I
>>> think that using arg1_type should work, but I am probably missing
>>> something subtle, so please feel free to whack me on the temple until
>>> I get it. :-)
>>> 
>>> Bill
>>>> 
>>>> Richard.
>>>> 
>>>>> +        gimple_set_location (g, loc);
>>>>> +        gsi_replace (gsi, g, true);
>>>>> +
>>>>> +        return true;
>>>>> +
>>>>> +      }
>>>>> +
>>>>>   default:
>>>>>      if (TARGET_DEBUG_BUILTIN)
>>>>>         fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>>>>>                  fn_code, fn_name1, fn_name2);
>>>>>     break;
will schmidt Sept. 14, 2017, 8:51 p.m. UTC | #10
On Thu, 2017-09-14 at 09:38 -0500, Bill Schmidt wrote:
> On Sep 14, 2017, at 5:15 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> > 
> > On Wed, Sep 13, 2017 at 10:14 PM, Bill Schmidt
> > <wschmidt@linux.vnet.ibm.com> wrote:
> >> On Sep 13, 2017, at 10:40 AM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote:
> >>> 
> >>> On Sep 13, 2017, at 7:23 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> >>>> 
> >>>> On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
> >>>> <will_schmidt@vnet.ibm.com> wrote:
> >>>>> Hi,
> >>>>> 
> >>>>> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
> >>>>> 
> >>>>> Folding of vector loads in GIMPLE.
> >>>>> 
> >>>>> Add code to handle gimple folding for the vec_ld builtins.
> >>>>> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
> >>>>> comments have been adjusted slightly so they continue to read OK for the
> >>>>> existing vec_st code.
> >>>>> 
> >>>>> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
> >>>>> tests which have been posted separately.
> >>>>> 
> >>>>> For V2 of this patch, I've removed the chunk of code that prohibited the
> >>>>> gimple fold from occurring in BE environments.   This had fixed an issue
> >>>>> for me earlier during my development of the code, and turns out this was
> >>>>> not necessary.  I've sniff-tested after removing that check and it looks
> >>>>> OK.
> >>>>> 
> >>>>>> + /* Limit folding of loads to LE targets.  */
> >>>>>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
> >>>>>> +        return false;
> >>>>> 
> >>>>> I've restarted a regression test on this updated version.
> >>>>> 
> >>>>> OK for trunk (assuming successful regression test completion)  ?
> >>>>> 
> >>>>> Thanks,
> >>>>> -Will
> >>>>> 
> >>>>> [gcc]
> >>>>> 
> >>>>>      2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
> >>>>> 
> >>>>>      * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
> >>>>>      for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
> >>>>>      * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
> >>>>>      Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
> >>>>> 
> >>>>> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
> >>>>> index fbab0a2..bb8a77d 100644
> >>>>> --- a/gcc/config/rs6000/rs6000-c.c
> >>>>> +++ b/gcc/config/rs6000/rs6000-c.c
> >>>>> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
> >>>>>                   convert (TREE_TYPE (stmt), arg0));
> >>>>>     stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
> >>>>>     return stmt;
> >>>>>   }
> >>>>> 
> >>>>> -  /* Expand vec_ld into an expression that masks the address and
> >>>>> -     performs the load.  We need to expand this early to allow
> >>>>> +  /* Expand vec_st into an expression that masks the address and
> >>>>> +     performs the store.  We need to expand this early to allow
> >>>>>    the best aliasing, as by the time we get into RTL we no longer
> >>>>>    are able to honor __restrict__, for example.  We may want to
> >>>>>    consider this for all memory access built-ins.
> >>>>> 
> >>>>>    When -maltivec=be is specified, or the wrong number of arguments
> >>>>>    is provided, simply punt to existing built-in processing.  */
> >>>>> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
> >>>>> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> >>>>> -      && nargs == 2)
> >>>>> -    {
> >>>>> -      tree arg0 = (*arglist)[0];
> >>>>> -      tree arg1 = (*arglist)[1];
> >>>>> -
> >>>>> -      /* Strip qualifiers like "const" from the pointer arg.  */
> >>>>> -      tree arg1_type = TREE_TYPE (arg1);
> >>>>> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
> >>>>> -       goto bad;
> >>>>> -
> >>>>> -      tree inner_type = TREE_TYPE (arg1_type);
> >>>>> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
> >>>>> -       {
> >>>>> -         arg1_type = build_pointer_type (build_qualified_type (inner_type,
> >>>>> -                                                               0));
> >>>>> -         arg1 = fold_convert (arg1_type, arg1);
> >>>>> -       }
> >>>>> -
> >>>>> -      /* Construct the masked address.  Let existing error handling take
> >>>>> -        over if we don't have a constant offset.  */
> >>>>> -      arg0 = fold (arg0);
> >>>>> -
> >>>>> -      if (TREE_CODE (arg0) == INTEGER_CST)
> >>>>> -       {
> >>>>> -         if (!ptrofftype_p (TREE_TYPE (arg0)))
> >>>>> -           arg0 = build1 (NOP_EXPR, sizetype, arg0);
> >>>>> -
> >>>>> -         tree arg1_type = TREE_TYPE (arg1);
> >>>>> -         if (TREE_CODE (arg1_type) == ARRAY_TYPE)
> >>>>> -           {
> >>>>> -             arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
> >>>>> -             tree const0 = build_int_cstu (sizetype, 0);
> >>>>> -             tree arg1_elt0 = build_array_ref (loc, arg1, const0);
> >>>>> -             arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
> >>>>> -           }
> >>>>> -
> >>>>> -         tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
> >>>>> -                                      arg1, arg0);
> >>>>> -         tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
> >>>>> -                                         build_int_cst (arg1_type, -16));
> >>>>> -
> >>>>> -         /* Find the built-in to get the return type so we can convert
> >>>>> -            the result properly (or fall back to default handling if the
> >>>>> -            arguments aren't compatible).  */
> >>>>> -         for (desc = altivec_overloaded_builtins;
> >>>>> -              desc->code && desc->code != fcode; desc++)
> >>>>> -           continue;
> >>>>> -
> >>>>> -         for (; desc->code == fcode; desc++)
> >>>>> -           if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
> >>>>> -               && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
> >>>>> -                                                   desc->op2)))
> >>>>> -             {
> >>>>> -               tree ret_type = rs6000_builtin_type (desc->ret_type);
> >>>>> -               if (TYPE_MODE (ret_type) == V2DImode)
> >>>>> -                 /* Type-based aliasing analysis thinks vector long
> >>>>> -                    and vector long long are different and will put them
> >>>>> -                    in distinct alias classes.  Force our return type
> >>>>> -                    to be a may-alias type to avoid this.  */
> >>>>> -                 ret_type
> >>>>> -                   = build_pointer_type_for_mode (ret_type, Pmode,
> >>>>> -                                                  true/*can_alias_all*/);
> >>>>> -               else
> >>>>> -                 ret_type = build_pointer_type (ret_type);
> >>>>> -               aligned = build1 (NOP_EXPR, ret_type, aligned);
> >>>>> -               tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
> >>>>> -               return ret_val;
> >>>>> -             }
> >>>>> -       }
> >>>>> -    }
> >>>>> 
> >>>>> -  /* Similarly for stvx.  */
> >>>>> if (fcode == ALTIVEC_BUILTIN_VEC_ST
> >>>>>     && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> >>>>>     && nargs == 3)
> >>>>>   {
> >>>>>     tree arg0 = (*arglist)[0];
> >>>>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> >>>>> index 1338371..1fb5f44 100644
> >>>>> --- a/gcc/config/rs6000/rs6000.c
> >>>>> +++ b/gcc/config/rs6000/rs6000.c
> >>>>> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> >>>>>      res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
> >>>>>      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> >>>>>      update_call_from_tree (gsi, res);
> >>>>>      return true;
> >>>>>     }
> >>>>> +    /* Vector loads.  */
> >>>>> +    case ALTIVEC_BUILTIN_LVX_V16QI:
> >>>>> +    case ALTIVEC_BUILTIN_LVX_V8HI:
> >>>>> +    case ALTIVEC_BUILTIN_LVX_V4SI:
> >>>>> +    case ALTIVEC_BUILTIN_LVX_V4SF:
> >>>>> +    case ALTIVEC_BUILTIN_LVX_V2DI:
> >>>>> +    case ALTIVEC_BUILTIN_LVX_V2DF:
> >>>>> +      {
> >>>>> +        gimple *g;
> >>>>> +        arg0 = gimple_call_arg (stmt, 0);  // offset
> >>>>> +        arg1 = gimple_call_arg (stmt, 1);  // address
> >>>>> +
> >>>>> +        lhs = gimple_call_lhs (stmt);
> >>>>> +        location_t loc = gimple_location (stmt);
> >>>>> +
> >>>>> +        tree arg1_type = TREE_TYPE (arg1);
> >>>>> +        tree lhs_type = TREE_TYPE (lhs);
> >>>>> +
> >>>>> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
> >>>>> +           the tree using the value from arg0.  The resulting type will match
> >>>>> +           the type of arg1.  */
> >>>>> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
> >>>>> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
> >>>>> +        gimple_set_location (g, loc);
> >>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
> >>>>> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
> >>>>> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
> >>>>> +                                 temp_offset);
> >>>>> +        gimple_set_location (g, loc);
> >>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
> >>>>> +
> >>>>> +        /* Mask off any lower bits from the address.  */
> >>>>> +        tree alignment_mask = build_int_cst (arg1_type, -16);
> >>>>> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
> >>>>> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
> >>>>> +                                temp_addr, alignment_mask);
> >>>>> +        gimple_set_location (g, loc);
> >>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
> >>>> 
> >>>> You could use
> >>>> 
> >>>> gimple_seq stmts = NULL;
> >>>> tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
> >>>> tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR,
> >>>> arg1_type, arg1, temp_offset);
> >>>> tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR,
> >>>> arg1_type, temp_addr, build_int_cst (arg1_type, -16));
> >>>> gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> >>>> 
> >>>>> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
> >>>>> +           take an offset, but since we've already incorporated the offset
> >>>>> +           above, here we just pass in a zero.  */
> >>>>> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
> >>>>> +                                               build_int_cst (arg1_type, 0)));
> >>>> 
> >>>> are you sure about arg1_type here?  I'm sure not.  For
> >>>> 
> >>>> ... foo (struct S *p)
> >>>> {
> >>>> return __builtin_lvx_v2df (4, (double *)p);
> >>>> }
> >>>> 
> >>>> you'd end up with p as arg1 and thus struct S * as arg1_type and thus
> >>>> TBAA using 'struct S' to access the memory.
> >>> 
> >>> Hm, is that so?  Wouldn't arg1_type be double* since arg1 is (double *)p?
> >>> Will, you should probably test this example and see, but I'm pretty confident
> >>> about this (see below).
> >> 
> >> But, as I should have suspected, you're right.  For some reason
> >> gimple_call_arg is returning p, stripped of the cast information where the
> >> user asserted that p points to a double*.
> >> 
> >> Can you explain to me why this should be so?  I assume that somebody
> >> has decided to strip_nops the argument and lose the cast.
> > 
> > pointer types have no meaning in GIMPLE so we aggressively prune them.
> > 
> >> Using ptr_type_node loses all type information, so that would be a
> >> regression from what we do today.  In some cases we could reconstruct
> >> that this was necessarily, say, a double*, but I don't know how we would
> >> recover the signedness for an integer type.
> > 
> > How did we handle the expansion previously - ah - it was done earlier
> > in the C FE.  So why are you moving it to GIMPLE?  The function is called
> > resolve_overloaded_builtin - what kind of overloading do you resolve here?
> > As said argument types might not be preserved.
> 
> The AltiVec builtins allow overloaded names based on the argument types,
> using a special callout during parsing to convert the overloaded names to
> type-specific names.  Historically these have then remained builtin calls
> until RTL expansion, which loses a lot of useful optimization.  Will has been
> gradually implementing gimple folding for these builtins so that we can
> optimize simple vector arithmetic and so on.  The overloading is still dealt
> with during parsing.
> 
> As an example:
> 
>   double a[64];
>   vector double x = vec_ld (0, a);
> 
> will get translated into
> 
>   vector double x = __builtin_altivec_lvx_v2df (0, a);
> 
> and 
> 
>   unsigned char b[64];
>   vector unsigned char y = vec_ld (0, b);
> 
> will get translated into
> 
>   vector unsigned char y = __builtin_altivec_lvx_v16qi (0, b);
> 
> So in resolving the overloading we still maintain the type info for arg1.
> 
> Earlier I had dealt with the performance issue in a different way for the 
> vec_ld and vec_st overloaded builtins, which created the rather grotty 
> code in rs6000-c.c to modify the parse trees instead.  My hope was that
> we could simplify the code by having Will deal with them as gimple folds
> instead.  But if in so doing we lose type information, that may not be the
> right call.
> 
> However, since you say that gimple aggressively removes the casts 
> from pointer types, perhaps the code that we see in early gimple from
> the existing method might also be missing the type information?  Will,
> it would be worth looking at that code to see.  If it's no different then
> perhaps we still go ahead with the folding.

The rs6000-c.c version of the code did not fold unless arg0 was
constant; and if it was a constant, it appears the operation got turned
directly into a pointer dereference.  So there isn't a good before/after
comparison there.

What I see:
	  return vec_ld (ll1, (vector double *)p);

at gimple-time after the rs6000-c.c folding was a mostly un-folded 

  D.3207 = __builtin_altivec_lvx_v2dfD.1443 (ll1D.3192, pD.3193);

while this, with a constant value for arg0:
	return vec_ld (16, (vector double *)p);
at gimple time after rs6000-c.c folding became a reference:
  _1 = p + 16;
  _2 = _1 & -16B;
  D.3196 = *_2;

with the rs6000.c gimple folding code (the changes I've got locally),
the before/after when arg0 is constant reads the same.  When arg0 is a
variable:
	  return vec_ld (ll1, (vector double *)p);
at dump-gimple time it then becomes:
  D.3208 = (sizetype) ll1D.3192;
  D.3209 = pD.3193 + D.3208;
  D.3210 = D.3209 & -16B;
  D.3207 = MEM[(struct S *)D.3210];

And if I change the code such that arg1_type is instead ptr_type_node:
  D.3207 = MEM[(voidD.44 *)D.3210];
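
For reference, the "struct S" and "void *" in those dumps come straight
from the type used on the MEM_REF's offset constant in the patch as
posted:

  /* As posted: arg1_type is the (cast-stripped) pointer argument's type,
     so here it is struct S *, and the access is dumped as
     MEM[(struct S *)D.3210].  */
  build2 (MEM_REF, lhs_type, aligned_addr,
          build_int_cst (arg1_type, 0));

  /* Substituting ptr_type_node for arg1_type gives the
     MEM[(voidD.44 *)D.3210] form, i.e. no type (and no TBAA) info.  */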

So...  

> Another note for Will:  The existing code gives up when -maltivec=be has
> been specified, and you probably want to do that as well.  That may be
> why you initially turned off big endian -- it is easy to misread that code.
> -maltivec=be is VECTOR_ELT_ORDER_BIG && !BYTES_BIG_ENDIAN.

Yeah, I apparently inverted and confused the logic when I made that
change.  My current snippet reads as:

         /* Do not fold for -maltivec=be on LE targets.  */
         if (VECTOR_ELT_ORDER_BIG && !BYTES_BIG_ENDIAN)
           return false;


> Thanks,
> Bill
> > 
> > Richard.
> > 
> >> Bill
> >>> 
> >>>> 
> >>>> I think if the builtins have any TBAA constraints you need to build those
> >>>> explicitely, if not, you should use ptr_type_node aka no TBAA.
> >>> 
> >>> The type signatures are constrained during parsing, so we should only
> >>> see allowed pointer types on arg1 by the time we get to gimple folding.  I
> >>> think that using arg1_type should work, but I am probably missing
> >>> something subtle, so please feel free to whack me on the temple until
> >>> I get it. :-)
> >>> 
> >>> Bill
> >>>> 
> >>>> Richard.
> >>>> 
> >>>>> +        gimple_set_location (g, loc);
> >>>>> +        gsi_replace (gsi, g, true);
> >>>>> +
> >>>>> +        return true;
> >>>>> +
> >>>>> +      }
> >>>>> +
> >>>>>   default:
> >>>>>      if (TARGET_DEBUG_BUILTIN)
> >>>>>         fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
> >>>>>                  fn_code, fn_name1, fn_name2);
> >>>>>     break;
>
Richard Biener Sept. 15, 2017, 9:13 a.m. UTC | #11
On Thu, Sep 14, 2017 at 4:38 PM, Bill Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> On Sep 14, 2017, at 5:15 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>
>> On Wed, Sep 13, 2017 at 10:14 PM, Bill Schmidt
>> <wschmidt@linux.vnet.ibm.com> wrote:
>>> On Sep 13, 2017, at 10:40 AM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote:
>>>>
>>>> On Sep 13, 2017, at 7:23 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>>>>
>>>>> On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
>>>>> <will_schmidt@vnet.ibm.com> wrote:
>>>>>> Hi,
>>>>>>
>>>>>> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
>>>>>>
>>>>>> Folding of vector loads in GIMPLE.
>>>>>>
>>>>>> Add code to handle gimple folding for the vec_ld builtins.
>>>>>> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
>>>>>> comments have been adjusted slightly so they continue to read OK for the
>>>>>> existing vec_st code.
>>>>>>
>>>>>> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
>>>>>> tests which have been posted separately.
>>>>>>
>>>>>> For V2 of this patch, I've removed the chunk of code that prohibited the
>>>>>> gimple fold from occurring in BE environments.   This had fixed an issue
>>>>>> for me earlier during my development of the code, and turns out this was
>>>>>> not necessary.  I've sniff-tested after removing that check and it looks
>>>>>> OK.
>>>>>>
>>>>>>> + /* Limit folding of loads to LE targets.  */
>>>>>>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>>>>>>> +        return false;
>>>>>>
>>>>>> I've restarted a regression test on this updated version.
>>>>>>
>>>>>> OK for trunk (assuming successful regression test completion)  ?
>>>>>>
>>>>>> Thanks,
>>>>>> -Will
>>>>>>
>>>>>> [gcc]
>>>>>>
>>>>>>      2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
>>>>>>
>>>>>>      * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>>>>>>      for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>>>>>>      * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>>>>>>      Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
>>>>>>
>>>>>> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
>>>>>> index fbab0a2..bb8a77d 100644
>>>>>> --- a/gcc/config/rs6000/rs6000-c.c
>>>>>> +++ b/gcc/config/rs6000/rs6000-c.c
>>>>>> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
>>>>>>                   convert (TREE_TYPE (stmt), arg0));
>>>>>>     stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>>>>>>     return stmt;
>>>>>>   }
>>>>>>
>>>>>> -  /* Expand vec_ld into an expression that masks the address and
>>>>>> -     performs the load.  We need to expand this early to allow
>>>>>> +  /* Expand vec_st into an expression that masks the address and
>>>>>> +     performs the store.  We need to expand this early to allow
>>>>>>    the best aliasing, as by the time we get into RTL we no longer
>>>>>>    are able to honor __restrict__, for example.  We may want to
>>>>>>    consider this for all memory access built-ins.
>>>>>>
>>>>>>    When -maltivec=be is specified, or the wrong number of arguments
>>>>>>    is provided, simply punt to existing built-in processing.  */
>>>>>> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
>>>>>> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>>>> -      && nargs == 2)
>>>>>> -    {
>>>>>> -      tree arg0 = (*arglist)[0];
>>>>>> -      tree arg1 = (*arglist)[1];
>>>>>> -
>>>>>> -      /* Strip qualifiers like "const" from the pointer arg.  */
>>>>>> -      tree arg1_type = TREE_TYPE (arg1);
>>>>>> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
>>>>>> -       goto bad;
>>>>>> -
>>>>>> -      tree inner_type = TREE_TYPE (arg1_type);
>>>>>> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
>>>>>> -       {
>>>>>> -         arg1_type = build_pointer_type (build_qualified_type (inner_type,
>>>>>> -                                                               0));
>>>>>> -         arg1 = fold_convert (arg1_type, arg1);
>>>>>> -       }
>>>>>> -
>>>>>> -      /* Construct the masked address.  Let existing error handling take
>>>>>> -        over if we don't have a constant offset.  */
>>>>>> -      arg0 = fold (arg0);
>>>>>> -
>>>>>> -      if (TREE_CODE (arg0) == INTEGER_CST)
>>>>>> -       {
>>>>>> -         if (!ptrofftype_p (TREE_TYPE (arg0)))
>>>>>> -           arg0 = build1 (NOP_EXPR, sizetype, arg0);
>>>>>> -
>>>>>> -         tree arg1_type = TREE_TYPE (arg1);
>>>>>> -         if (TREE_CODE (arg1_type) == ARRAY_TYPE)
>>>>>> -           {
>>>>>> -             arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
>>>>>> -             tree const0 = build_int_cstu (sizetype, 0);
>>>>>> -             tree arg1_elt0 = build_array_ref (loc, arg1, const0);
>>>>>> -             arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
>>>>>> -           }
>>>>>> -
>>>>>> -         tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
>>>>>> -                                      arg1, arg0);
>>>>>> -         tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
>>>>>> -                                         build_int_cst (arg1_type, -16));
>>>>>> -
>>>>>> -         /* Find the built-in to get the return type so we can convert
>>>>>> -            the result properly (or fall back to default handling if the
>>>>>> -            arguments aren't compatible).  */
>>>>>> -         for (desc = altivec_overloaded_builtins;
>>>>>> -              desc->code && desc->code != fcode; desc++)
>>>>>> -           continue;
>>>>>> -
>>>>>> -         for (; desc->code == fcode; desc++)
>>>>>> -           if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
>>>>>> -               && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
>>>>>> -                                                   desc->op2)))
>>>>>> -             {
>>>>>> -               tree ret_type = rs6000_builtin_type (desc->ret_type);
>>>>>> -               if (TYPE_MODE (ret_type) == V2DImode)
>>>>>> -                 /* Type-based aliasing analysis thinks vector long
>>>>>> -                    and vector long long are different and will put them
>>>>>> -                    in distinct alias classes.  Force our return type
>>>>>> -                    to be a may-alias type to avoid this.  */
>>>>>> -                 ret_type
>>>>>> -                   = build_pointer_type_for_mode (ret_type, Pmode,
>>>>>> -                                                  true/*can_alias_all*/);
>>>>>> -               else
>>>>>> -                 ret_type = build_pointer_type (ret_type);
>>>>>> -               aligned = build1 (NOP_EXPR, ret_type, aligned);
>>>>>> -               tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
>>>>>> -               return ret_val;
>>>>>> -             }
>>>>>> -       }
>>>>>> -    }
>>>>>>
>>>>>> -  /* Similarly for stvx.  */
>>>>>> if (fcode == ALTIVEC_BUILTIN_VEC_ST
>>>>>>     && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>>>>     && nargs == 3)
>>>>>>   {
>>>>>>     tree arg0 = (*arglist)[0];
>>>>>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>>>>>> index 1338371..1fb5f44 100644
>>>>>> --- a/gcc/config/rs6000/rs6000.c
>>>>>> +++ b/gcc/config/rs6000/rs6000.c
>>>>>> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>>>>>>      res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
>>>>>>      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>>>>      update_call_from_tree (gsi, res);
>>>>>>      return true;
>>>>>>     }
>>>>>> +    /* Vector loads.  */
>>>>>> +    case ALTIVEC_BUILTIN_LVX_V16QI:
>>>>>> +    case ALTIVEC_BUILTIN_LVX_V8HI:
>>>>>> +    case ALTIVEC_BUILTIN_LVX_V4SI:
>>>>>> +    case ALTIVEC_BUILTIN_LVX_V4SF:
>>>>>> +    case ALTIVEC_BUILTIN_LVX_V2DI:
>>>>>> +    case ALTIVEC_BUILTIN_LVX_V2DF:
>>>>>> +      {
>>>>>> +        gimple *g;
>>>>>> +        arg0 = gimple_call_arg (stmt, 0);  // offset
>>>>>> +        arg1 = gimple_call_arg (stmt, 1);  // address
>>>>>> +
>>>>>> +        lhs = gimple_call_lhs (stmt);
>>>>>> +        location_t loc = gimple_location (stmt);
>>>>>> +
>>>>>> +        tree arg1_type = TREE_TYPE (arg1);
>>>>>> +        tree lhs_type = TREE_TYPE (lhs);
>>>>>> +
>>>>>> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
>>>>>> +           the tree using the value from arg0.  The resulting type will match
>>>>>> +           the type of arg1.  */
>>>>>> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
>>>>>> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
>>>>>> +        gimple_set_location (g, loc);
>>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>>>> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>>>>> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
>>>>>> +                                 temp_offset);
>>>>>> +        gimple_set_location (g, loc);
>>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>>>> +
>>>>>> +        /* Mask off any lower bits from the address.  */
>>>>>> +        tree alignment_mask = build_int_cst (arg1_type, -16);
>>>>>> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>>>>> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
>>>>>> +                                temp_addr, alignment_mask);
>>>>>> +        gimple_set_location (g, loc);
>>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>>>
>>>>> You could use
>>>>>
>>>>> gimple_seq stmts = NULL;
>>>>> tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
>>>>> tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR,
>>>>> arg1_type, arg1, temp_offset);
>>>>> tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR,
>>>>> arg1_type, temp_addr, build_int_cst (arg1_type, -16));
>>>>> gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>>>
>>>>>> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
>>>>>> +           take an offset, but since we've already incorporated the offset
>>>>>> +           above, here we just pass in a zero.  */
>>>>>> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
>>>>>> +                                               build_int_cst (arg1_type, 0)));
>>>>>
>>>>> are you sure about arg1_type here?  I'm sure not.  For
>>>>>
>>>>> ... foo (struct S *p)
>>>>> {
>>>>> return __builtin_lvx_v2df (4, (double *)p);
>>>>> }
>>>>>
>>>>> you'd end up with p as arg1 and thus struct S * as arg1_type and thus
>>>>> TBAA using 'struct S' to access the memory.
>>>>
>>>> Hm, is that so?  Wouldn't arg1_type be double* since arg1 is (double *)p?
>>>> Will, you should probably test this example and see, but I'm pretty confident
>>>> about this (see below).
>>>
>>> But, as I should have suspected, you're right.  For some reason
>>> gimple_call_arg is returning p, stripped of the cast information where the
>>> user asserted that p points to a double*.
>>>
>>> Can you explain to me why this should be so?  I assume that somebody
>>> has decided to strip_nops the argument and lose the cast.
>>
>> pointer types have no meaning in GIMPLE so we aggressively prune them.
>>
>>> Using ptr_type_node loses all type information, so that would be a
>>> regression from what we do today.  In some cases we could reconstruct
>>> that this was necessarily, say, a double*, but I don't know how we would
>>> recover the signedness for an integer type.
>>
>> How did we handle the expansion previously - ah - it was done earlier
>> in the C FE.  So why are you moving it to GIMPLE?  The function is called
>> resolve_overloaded_builtin - what kind of overloading do you resolve here?
>> As said argument types might not be preserved.
>
> The AltiVec builtins allow overloaded names based on the argument types,
> using a special callout during parsing to convert the overloaded names to
> type-specific names.  Historically these have then remained builtin calls
> until RTL expansion, which loses a lot of useful optimization.  Will has been
> gradually implementing gimple folding for these builtins so that we can
> optimize simple vector arithmetic and so on.  The overloading is still dealt
> with during parsing.
>
> As an example:
>
>   double a[64];
>   vector double x = vec_ld (0, a);
>
> will get translated into
>
>   vector double x = __builtin_altivec_lvx_v2df (0, a);
>
> and
>
>   unsigned char b[64];
>   vector unsigned char y = vec_ld (0, b);
>
> will get translated into
>
>   vector unsigned char y = __builtin_altivec_lvx_v16qi (0, b);
>
> So in resolving the overloading we still maintain the type info for arg1.

So TBAA-wise the vec_ld is specced to use alias-set zero for this case
as it loads from an unsigned char array?  Or is it alias-set zero because
the type of arg1 is unsigned char *?  What if the type of arg1 was
struct X *?

> Earlier I had dealt with the performance issue in a different way for the
> vec_ld and vec_st overloaded builtins, which created the rather grotty
> code in rs6000-c.c to modify the parse trees instead.  My hope was that
> we could simplify the code by having Will deal with them as gimple folds
> instead.  But if in so doing we lose type information, that may not be the
> right call.
>
> However, since you say that gimple aggressively removes the casts
> from pointer types, perhaps the code that we see in early gimple from
> the existing method might also be missing the type information?  Will,
> it would be worth looking at that code to see.  If it's no different then
> perhaps we still go ahead with the folding.

As I said you can't simply use the type of arg1 for the TBAA type.
You can conservatively use ptr_type_node (alias-set zero) or you
can use sth that you derive from the builtin used (is a supposedly
existing _v4si variant always subject to int * TBAA?)
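
A minimal sketch of the conservative option, assuming the names used in the
posted patch (lhs, lhs_type, aligned_addr, loc, gsi): building the MEM_REF's
constant offset operand with ptr_type_node puts the access in alias-set zero
instead of whatever type arg1 happens to carry.

  /* Sketch only, not the submitted patch: conservative TBAA by using
     ptr_type_node as the type of the MEM_REF offset operand.  */
  gimple *g
    = gimple_build_assign (lhs,
                           build2 (MEM_REF, lhs_type, aligned_addr,
                                   build_int_cst (ptr_type_node, 0)));
  gimple_set_location (g, loc);
  gsi_replace (gsi, g, true);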

> Another note for Will:  The existing code gives up when -maltivec=be has
> been specified, and you probably want to do that as well.  That may be
> why you initially turned off big endian -- it is easy to misread that code.
> -maltivec=be is VECTOR_ELT_ORDER_BIG && !BYTES_BIG_ENDIAN.
>
> Thanks,
> Bill
>>
>> Richard.
>>
>>> Bill
>>>>
>>>>>
>>>>> I think if the builtins have any TBAA constraints you need to build those
>>>>> explicitly, if not, you should use ptr_type_node aka no TBAA.
>>>>
>>>> The type signatures are constrained during parsing, so we should only
>>>> see allowed pointer types on arg1 by the time we get to gimple folding.  I
>>>> think that using arg1_type should work, but I am probably missing
>>>> something subtle, so please feel free to whack me on the temple until
>>>> I get it. :-)
>>>>
>>>> Bill
>>>>>
>>>>> Richard.
>>>>>
>>>>>> +        gimple_set_location (g, loc);
>>>>>> +        gsi_replace (gsi, g, true);
>>>>>> +
>>>>>> +        return true;
>>>>>> +
>>>>>> +      }
>>>>>> +
>>>>>>   default:
>>>>>>      if (TARGET_DEBUG_BUILTIN)
>>>>>>         fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>>>>>>                  fn_code, fn_name1, fn_name2);
>>>>>>     break;
>
Bill Schmidt Sept. 15, 2017, 1:16 p.m. UTC | #12
On Sep 15, 2017, at 4:13 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> 
> On Thu, Sep 14, 2017 at 4:38 PM, Bill Schmidt
> <wschmidt@linux.vnet.ibm.com> wrote:
>> On Sep 14, 2017, at 5:15 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>> 
>>> On Wed, Sep 13, 2017 at 10:14 PM, Bill Schmidt
>>> <wschmidt@linux.vnet.ibm.com> wrote:
>>>> On Sep 13, 2017, at 10:40 AM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote:
>>>>> 
>>>>> On Sep 13, 2017, at 7:23 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>>>>> 
>>>>>> On Tue, Sep 12, 2017 at 11:08 PM, Will Schmidt
>>>>>> <will_schmidt@vnet.ibm.com> wrote:
>>>>>>> Hi,
>>>>>>> 
>>>>>>> [PATCH, rs6000] [v2] Folding of vector loads in GIMPLE
>>>>>>> 
>>>>>>> Folding of vector loads in GIMPLE.
>>>>>>> 
>>>>>>> Add code to handle gimple folding for the vec_ld builtins.
>>>>>>> Remove the now obsoleted folding code for vec_ld from rs6000-c.c. Surrounding
>>>>>>> comments have been adjusted slightly so they continue to read OK for the
>>>>>>> existing vec_st code.
>>>>>>> 
>>>>>>> The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
>>>>>>> tests which have been posted separately.
>>>>>>> 
>>>>>>> For V2 of this patch, I've removed the chunk of code that prohibited the
>>>>>>> gimple fold from occurring in BE environments.  This had fixed an issue
>>>>>>> for me earlier during my development of the code, but it turns out it was
>>>>>>> not necessary.  I've sniff-tested after removing that check and it looks
>>>>>>> OK.
>>>>>>> 
>>>>>>>> + /* Limit folding of loads to LE targets.  */
>>>>>>>> +      if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
>>>>>>>> +        return false;
>>>>>>> 
>>>>>>> I've restarted a regression test on this updated version.
>>>>>>> 
>>>>>>> OK for trunk (assuming successful regression test completion)  ?
>>>>>>> 
>>>>>>> Thanks,
>>>>>>> -Will
>>>>>>> 
>>>>>>> [gcc]
>>>>>>> 
>>>>>>>     2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
>>>>>>> 
>>>>>>>     * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
>>>>>>>     for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
>>>>>>>     * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
>>>>>>>     Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
>>>>>>> 
>>>>>>> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
>>>>>>> index fbab0a2..bb8a77d 100644
>>>>>>> --- a/gcc/config/rs6000/rs6000-c.c
>>>>>>> +++ b/gcc/config/rs6000/rs6000-c.c
>>>>>>> @@ -6470,92 +6470,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
>>>>>>>                  convert (TREE_TYPE (stmt), arg0));
>>>>>>>    stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
>>>>>>>    return stmt;
>>>>>>>  }
>>>>>>> 
>>>>>>> -  /* Expand vec_ld into an expression that masks the address and
>>>>>>> -     performs the load.  We need to expand this early to allow
>>>>>>> +  /* Expand vec_st into an expression that masks the address and
>>>>>>> +     performs the store.  We need to expand this early to allow
>>>>>>>   the best aliasing, as by the time we get into RTL we no longer
>>>>>>>   are able to honor __restrict__, for example.  We may want to
>>>>>>>   consider this for all memory access built-ins.
>>>>>>> 
>>>>>>>   When -maltivec=be is specified, or the wrong number of arguments
>>>>>>>   is provided, simply punt to existing built-in processing.  */
>>>>>>> -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
>>>>>>> -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>>>>> -      && nargs == 2)
>>>>>>> -    {
>>>>>>> -      tree arg0 = (*arglist)[0];
>>>>>>> -      tree arg1 = (*arglist)[1];
>>>>>>> -
>>>>>>> -      /* Strip qualifiers like "const" from the pointer arg.  */
>>>>>>> -      tree arg1_type = TREE_TYPE (arg1);
>>>>>>> -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
>>>>>>> -       goto bad;
>>>>>>> -
>>>>>>> -      tree inner_type = TREE_TYPE (arg1_type);
>>>>>>> -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
>>>>>>> -       {
>>>>>>> -         arg1_type = build_pointer_type (build_qualified_type (inner_type,
>>>>>>> -                                                               0));
>>>>>>> -         arg1 = fold_convert (arg1_type, arg1);
>>>>>>> -       }
>>>>>>> -
>>>>>>> -      /* Construct the masked address.  Let existing error handling take
>>>>>>> -        over if we don't have a constant offset.  */
>>>>>>> -      arg0 = fold (arg0);
>>>>>>> -
>>>>>>> -      if (TREE_CODE (arg0) == INTEGER_CST)
>>>>>>> -       {
>>>>>>> -         if (!ptrofftype_p (TREE_TYPE (arg0)))
>>>>>>> -           arg0 = build1 (NOP_EXPR, sizetype, arg0);
>>>>>>> -
>>>>>>> -         tree arg1_type = TREE_TYPE (arg1);
>>>>>>> -         if (TREE_CODE (arg1_type) == ARRAY_TYPE)
>>>>>>> -           {
>>>>>>> -             arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
>>>>>>> -             tree const0 = build_int_cstu (sizetype, 0);
>>>>>>> -             tree arg1_elt0 = build_array_ref (loc, arg1, const0);
>>>>>>> -             arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
>>>>>>> -           }
>>>>>>> -
>>>>>>> -         tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
>>>>>>> -                                      arg1, arg0);
>>>>>>> -         tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
>>>>>>> -                                         build_int_cst (arg1_type, -16));
>>>>>>> -
>>>>>>> -         /* Find the built-in to get the return type so we can convert
>>>>>>> -            the result properly (or fall back to default handling if the
>>>>>>> -            arguments aren't compatible).  */
>>>>>>> -         for (desc = altivec_overloaded_builtins;
>>>>>>> -              desc->code && desc->code != fcode; desc++)
>>>>>>> -           continue;
>>>>>>> -
>>>>>>> -         for (; desc->code == fcode; desc++)
>>>>>>> -           if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
>>>>>>> -               && (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
>>>>>>> -                                                   desc->op2)))
>>>>>>> -             {
>>>>>>> -               tree ret_type = rs6000_builtin_type (desc->ret_type);
>>>>>>> -               if (TYPE_MODE (ret_type) == V2DImode)
>>>>>>> -                 /* Type-based aliasing analysis thinks vector long
>>>>>>> -                    and vector long long are different and will put them
>>>>>>> -                    in distinct alias classes.  Force our return type
>>>>>>> -                    to be a may-alias type to avoid this.  */
>>>>>>> -                 ret_type
>>>>>>> -                   = build_pointer_type_for_mode (ret_type, Pmode,
>>>>>>> -                                                  true/*can_alias_all*/);
>>>>>>> -               else
>>>>>>> -                 ret_type = build_pointer_type (ret_type);
>>>>>>> -               aligned = build1 (NOP_EXPR, ret_type, aligned);
>>>>>>> -               tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
>>>>>>> -               return ret_val;
>>>>>>> -             }
>>>>>>> -       }
>>>>>>> -    }
>>>>>>> 
>>>>>>> -  /* Similarly for stvx.  */
>>>>>>> if (fcode == ALTIVEC_BUILTIN_VEC_ST
>>>>>>>    && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
>>>>>>>    && nargs == 3)
>>>>>>>  {
>>>>>>>    tree arg0 = (*arglist)[0];
>>>>>>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>>>>>>> index 1338371..1fb5f44 100644
>>>>>>> --- a/gcc/config/rs6000/rs6000.c
>>>>>>> +++ b/gcc/config/rs6000/rs6000.c
>>>>>>> @@ -16547,10 +16547,61 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>>>>>>>     res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
>>>>>>>     gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>>>>>     update_call_from_tree (gsi, res);
>>>>>>>     return true;
>>>>>>>    }
>>>>>>> +    /* Vector loads.  */
>>>>>>> +    case ALTIVEC_BUILTIN_LVX_V16QI:
>>>>>>> +    case ALTIVEC_BUILTIN_LVX_V8HI:
>>>>>>> +    case ALTIVEC_BUILTIN_LVX_V4SI:
>>>>>>> +    case ALTIVEC_BUILTIN_LVX_V4SF:
>>>>>>> +    case ALTIVEC_BUILTIN_LVX_V2DI:
>>>>>>> +    case ALTIVEC_BUILTIN_LVX_V2DF:
>>>>>>> +      {
>>>>>>> +        gimple *g;
>>>>>>> +        arg0 = gimple_call_arg (stmt, 0);  // offset
>>>>>>> +        arg1 = gimple_call_arg (stmt, 1);  // address
>>>>>>> +
>>>>>>> +        lhs = gimple_call_lhs (stmt);
>>>>>>> +        location_t loc = gimple_location (stmt);
>>>>>>> +
>>>>>>> +        tree arg1_type = TREE_TYPE (arg1);
>>>>>>> +        tree lhs_type = TREE_TYPE (lhs);
>>>>>>> +
>>>>>>> +        /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
>>>>>>> +           the tree using the value from arg0.  The resulting type will match
>>>>>>> +           the type of arg1.  */
>>>>>>> +        tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
>>>>>>> +        g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
>>>>>>> +        gimple_set_location (g, loc);
>>>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>>>>> +        tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>>>>>> +        g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
>>>>>>> +                                 temp_offset);
>>>>>>> +        gimple_set_location (g, loc);
>>>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>>>>> +
>>>>>>> +        /* Mask off any lower bits from the address.  */
>>>>>>> +        tree alignment_mask = build_int_cst (arg1_type, -16);
>>>>>>> +        tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
>>>>>>> +        g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
>>>>>>> +                                temp_addr, alignment_mask);
>>>>>>> +        gimple_set_location (g, loc);
>>>>>>> +        gsi_insert_before (gsi, g, GSI_SAME_STMT);
>>>>>> 
>>>>>> You could use
>>>>>> 
>>>>>> gimple_seq stmts = NULL;
>>>>>> tree temp_offset = gimple_convert (&stmts, loc, sizetype, arg0);
>>>>>> tree temp_addr = gimple_build (&stmts, loc, POINTER_PLUS_EXPR,
>>>>>> arg1_type, arg1, temp_offset);
>>>>>> tree aligned_addr = gimple_build (&stmts, loc, BIT_AND_EXPR,
>>>>>> arg1_type, temp_addr, build_int_cst (arg1_type, -16));
>>>>>> gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>>>>>> 
>>>>>>> +        /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
>>>>>>> +           take an offset, but since we've already incorporated the offset
>>>>>>> +           above, here we just pass in a zero.  */
>>>>>>> +        g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
>>>>>>> +                                               build_int_cst (arg1_type, 0)));
>>>>>> 
>>>>>> are you sure about arg1_type here?  I'm sure not.  For
>>>>>> 
>>>>>> ... foo (struct S *p)
>>>>>> {
>>>>>> return __builtin_lvx_v2df (4, (double *)p);
>>>>>> }
>>>>>> 
>>>>>> you'd end up with p as arg1 and thus struct S * as arg1_type and thus
>>>>>> TBAA using 'struct S' to access the memory.
>>>>> 
>>>>> Hm, is that so?  Wouldn't arg1_type be double* since arg1 is (double *)p?
>>>>> Will, you should probably test this example and see, but I'm pretty confident
>>>>> about this (see below).
>>>> 
>>>> But, as I should have suspected, you're right.  For some reason
>>>> gimple_call_arg is returning p, stripped of the cast information where the
>>>> user asserted that p points to a double*.
>>>> 
>>>> Can you explain to me why this should be so?  I assume that somebody
>>>> has decided to strip_nops the argument and lose the cast.
>>> 
>>> pointer types have no meaning in GIMPLE so we aggressively prune them.
>>> 
>>>> Using ptr_type_node loses all type information, so that would be a
>>>> regression from what we do today.  In some cases we could reconstruct
>>>> that this was necessarily, say, a double*, but I don't know how we would
>>>> recover the signedness for an integer type.
>>> 
>>> How did we handle the expansion previously - ah - it was done earlier
>>> in the C FE.  So why are you moving it to GIMPLE?  The function is called
>>> resolve_overloaded_builtin - what kind of overloading do you resolve here?
>>> As said argument types might not be preserved.
>> 
>> The AltiVec builtins allow overloaded names based on the argument types,
>> using a special callout during parsing to convert the overloaded names to
>> type-specific names.  Historically these have then remained builtin calls
>> until RTL expansion, which loses a lot of useful optimization.  Will has been
>> gradually implementing gimple folding for these builtins so that we can
>> optimize simple vector arithmetic and so on.  The overloading is still dealt
>> with during parsing.
>> 
>> As an example:
>> 
>>  double a[64];
>>  vector double x = vec_ld (0, a);
>> 
>> will get translated into
>> 
>>  vector double x = __builtin_altivec_lvx_v2df (0, a);
>> 
>> and
>> 
>>  unsigned char b[64];
>>  vector unsigned char y = vec_ld (0, b);
>> 
>> will get translated into
>> 
>>  vector unsigned char y = __builtin_altivec_lvx_v16qi (0, b);
>> 
>> So in resolving the overloading we still maintain the type info for arg1.
> 
> So TBAA-wise the vec_ld is specced to use alias-set zero for this case
> as it loads from an unsigned char array?  Or is it alias-set zero because
> the type of arg1 is unsigned char *?  What if the type of arg1 was
> struct X *?
> 
>> Earlier I had dealt with the performance issue in a different way for the
>> vec_ld and vec_st overloaded builtins, which created the rather grotty
>> code in rs6000-c.c to modify the parse trees instead.  My hope was that
>> we could simplify the code by having Will deal with them as gimple folds
>> instead.  But if in so doing we lose type information, that may not be the
>> right call.
>> 
>> However, since you say that gimple aggressively removes the casts
>> from pointer types, perhaps the code that we see in early gimple from
>> the existing method might also be missing the type information?  Will,
>> it would be worth looking at that code to see.  If it's no different then
>> perhaps we still go ahead with the folding.
> 
> As I said you can't simply use the type of arg1 for the TBAA type.
> You can conservatively use ptr_type_node (alias-set zero) or you
> can use sth that you derive from the builtin used (is a supposedly
> existing _v4si variant always subject to int * TBAA?)

After thinking about this a while, I believe Will should use ptr_type_node
here.  I think anything we do to try to enforce some TBAA on these
pointer types will be fragile.  Supposedly a _v2df variant should point
only to [an array of] double or to a vector double, and parsing enforces
that at least they've cast to that so they assert they know what they're
doing.  Beyond that we needn't be too fussed if it's actually a
struct X * or the like.  We already have issues with "vector long" and
"vector long long" being different types in theory but aliased together
for 64-bit because they are the same in practice.

As long as we are still commoning identical loads (which didn't use
to happen before the parsing-level expansion was done), I'll be happy.
We can always revisit this later if we feel like refined TBAA would solve a
concrete problem.
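
For illustration, a hypothetical testcase (in the spirit of the separately
posted powerpc/fold-vec-ld-*.c tests, but not one of them) of the kind of
redundancy the GIMPLE fold exposes; once both calls become plain aligned
MEM_REF loads, FRE/CSE should be able to common them into a single load:

  #include <altivec.h>

  vector float
  sum_twice (const float *p)
  {
    vector float x = vec_ld (0, p);
    vector float y = vec_ld (0, p);   /* same masked address: expect one load */
    return vec_add (x, y);
  }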

Bill
> 
>> Another note for Will:  The existing code gives up when -maltivec=be has
>> been specified, and you probably want to do that as well.  That may be
>> why you initially turned off big endian -- it is easy to misread that code.
>> -maltivec=be is VECTOR_ELT_ORDER_BIG && !BYTES_BIG_ENDIAN.
>> 
>> Thanks,
>> Bill
>>> 
>>> Richard.
>>> 
>>>> Bill
>>>>> 
>>>>>> 
>>>>>> I think if the builtins have any TBAA constraints you need to build those
>>>>>> explicitly, if not, you should use ptr_type_node aka no TBAA.
>>>>> 
>>>>> The type signatures are constrained during parsing, so we should only
>>>>> see allowed pointer types on arg1 by the time we get to gimple folding.  I
>>>>> think that using arg1_type should work, but I am probably missing
>>>>> something subtle, so please feel free to whack me on the temple until
>>>>> I get it. :-)
>>>>> 
>>>>> Bill
>>>>>> 
>>>>>> Richard.
>>>>>> 
>>>>>>> +        gimple_set_location (g, loc);
>>>>>>> +        gsi_replace (gsi, g, true);
>>>>>>> +
>>>>>>> +        return true;
>>>>>>> +
>>>>>>> +      }
>>>>>>> +
>>>>>>>  default:
>>>>>>>     if (TARGET_DEBUG_BUILTIN)
>>>>>>>        fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
>>>>>>>                 fn_code, fn_name1, fn_name2);
>>>>>>>    break;
diff mbox series

Patch

diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 897306c..73e14d9 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -6459,92 +6459,19 @@  altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
 		     convert (TREE_TYPE (stmt), arg0));
       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
       return stmt;
     }
 
-  /* Expand vec_ld into an expression that masks the address and
-     performs the load.  We need to expand this early to allow
+  /* Expand vec_st into an expression that masks the address and
+     performs the store.  We need to expand this early to allow
      the best aliasing, as by the time we get into RTL we no longer
      are able to honor __restrict__, for example.  We may want to
      consider this for all memory access built-ins.
 
      When -maltivec=be is specified, or the wrong number of arguments
      is provided, simply punt to existing built-in processing.  */
-  if (fcode == ALTIVEC_BUILTIN_VEC_LD
-      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
-      && nargs == 2)
-    {
-      tree arg0 = (*arglist)[0];
-      tree arg1 = (*arglist)[1];
-
-      /* Strip qualifiers like "const" from the pointer arg.  */
-      tree arg1_type = TREE_TYPE (arg1);
-      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
-	goto bad;
-
-      tree inner_type = TREE_TYPE (arg1_type);
-      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
-	{
-	  arg1_type = build_pointer_type (build_qualified_type (inner_type,
-								0));
-	  arg1 = fold_convert (arg1_type, arg1);
-	}
-
-      /* Construct the masked address.  Let existing error handling take
-	 over if we don't have a constant offset.  */
-      arg0 = fold (arg0);
-
-      if (TREE_CODE (arg0) == INTEGER_CST)
-	{
-	  if (!ptrofftype_p (TREE_TYPE (arg0)))
-	    arg0 = build1 (NOP_EXPR, sizetype, arg0);
-
-	  tree arg1_type = TREE_TYPE (arg1);
-	  if (TREE_CODE (arg1_type) == ARRAY_TYPE)
-	    {
-	      arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
-	      tree const0 = build_int_cstu (sizetype, 0);
-	      tree arg1_elt0 = build_array_ref (loc, arg1, const0);
-	      arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
-	    }
-
-	  tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
-				       arg1, arg0);
-	  tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
-					  build_int_cst (arg1_type, -16));
-
-	  /* Find the built-in to get the return type so we can convert
-	     the result properly (or fall back to default handling if the
-	     arguments aren't compatible).  */
-	  for (desc = altivec_overloaded_builtins;
-	       desc->code && desc->code != fcode; desc++)
-	    continue;
-
-	  for (; desc->code == fcode; desc++)
-	    if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
-		&& (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
-						    desc->op2)))
-	      {
-		tree ret_type = rs6000_builtin_type (desc->ret_type);
-		if (TYPE_MODE (ret_type) == V2DImode)
-		  /* Type-based aliasing analysis thinks vector long
-		     and vector long long are different and will put them
-		     in distinct alias classes.  Force our return type
-		     to be a may-alias type to avoid this.  */
-		  ret_type
-		    = build_pointer_type_for_mode (ret_type, Pmode,
-						   true/*can_alias_all*/);
-		else
-		  ret_type = build_pointer_type (ret_type);
-		aligned = build1 (NOP_EXPR, ret_type, aligned);
-		tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
-		return ret_val;
-	      }
-	}
-    }
 
-  /* Similarly for stvx.  */
   if (fcode == ALTIVEC_BUILTIN_VEC_ST
       && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
       && nargs == 3)
     {
       tree arg0 = (*arglist)[0];
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index cf744d8..5b14789 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -16473,10 +16473,65 @@  rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
 	update_call_from_tree (gsi, res);
 	return true;
       }
+    /* Vector loads.  */
+    case ALTIVEC_BUILTIN_LVX_V16QI:
+    case ALTIVEC_BUILTIN_LVX_V8HI:
+    case ALTIVEC_BUILTIN_LVX_V4SI:
+    case ALTIVEC_BUILTIN_LVX_V4SF:
+    case ALTIVEC_BUILTIN_LVX_V2DI:
+    case ALTIVEC_BUILTIN_LVX_V2DF:
+      {
+	 gimple *g;
+	 arg0 = gimple_call_arg (stmt, 0);  // offset
+	 arg1 = gimple_call_arg (stmt, 1);  // address
+
+	 /* Limit folding of loads to LE targets.  */
+	 if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
+	   return false;
+
+	 lhs = gimple_call_lhs (stmt);
+	 location_t loc = gimple_location (stmt);
+
+	 tree arg1_type = TREE_TYPE (arg1);
+	 tree lhs_type = TREE_TYPE (lhs);
+
+	 /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
+	    the tree using the value from arg0.  The resulting type will match
+	    the type of arg1.  */
+	 tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
+	 g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
+	 gimple_set_location (g, loc);
+	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
+	 tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
+	 g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
+				  temp_offset);
+	 gimple_set_location (g, loc);
+	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
+
+	 /* Mask off any lower bits from the address.  */
+	 tree alignment_mask = build_int_cst (arg1_type, -16);
+	 tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
+	 g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
+				 temp_addr, alignment_mask);
+	 gimple_set_location (g, loc);
+	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
+
+	 /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
+	    take an offset, but since we've already incorporated the offset
+	    above, here we just pass in a zero.  */
+	 g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
+						build_int_cst (arg1_type, 0)));
+	 gimple_set_location (g, loc);
+	 gsi_replace (gsi, g, true);
+
+	 return true;
+
+      }
+
     default:
 	if (TARGET_DEBUG_BUILTIN)
 	   fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
 		    fn_code, fn_name1, fn_name2);
       break;
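
For reference, a rough sketch (illustrative only, not compiler output) of the
shape this fold gives a vec_ld call on targets where it applies; the source is
modeled on the examples earlier in the thread and assumes VSX is enabled so
that vec_ld on double maps to __builtin_altivec_lvx_v2df:

  #include <altivec.h>

  vector double
  load_vd (int offset, const double *p)
  {
    return vec_ld (offset, p);
  }

  /* Roughly the folded GIMPLE expected from the patch (SSA names invented):
       _1 = (sizetype) offset;      NOP_EXPR to sizetype
       _2 = p + _1;                 POINTER_PLUS_EXPR
       _3 = _2 & -16B;              BIT_AND_EXPR masks the low address bits
       _4 = MEM[_3];                MEM_REF replaces the builtin call
     rather than a call to __builtin_altivec_lvx_v2df surviving to RTL.  */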