[RFC,rs6000] enable GIMPLE folding of vec_splat

Message ID 1533669927.4452.14.camel@brimstone.rchland.ibm.com
State New
Series [RFC,rs6000] enable GIMPLE folding of vec_splat

Commit Message

Will Schmidt Aug. 7, 2018, 7:25 p.m. UTC
Hi,
Enable GIMPLE folding of the vec_splat() intrinsic.

For review; feedback is expected. :-)
    
I came up with the following after spending some time poking around
at the tree_vec_extract() and vector_element() functions in
tree-vect-generic.c, looking for insights.  Some of this still seems a
bit clunky to me, but it is functional as far as make check can tell,
and is as far as I can get without additional input.
    
This uses the tree_vec_extract() function out of tree-vect-generic.c
to retrieve the splat value, which is a BIT_FIELD_REF.   That function is
made non-static as part of this change.
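For reference, a sketch of the helper's signature as exported here, plus a
hypothetical call (the names inner_type and vec are invented for the example):

	tree tree_vec_extract (gimple_stmt_iterator *gsi, tree type,
			       tree t, tree bitsize, tree bitpos);

	/* Extract the 32-bit element at bit offset 0 of vec; the helper
	   emits any needed statements at *gsi and hands back a
	   BIT_FIELD_REF <vec, 32, 0>.  */
	tree elt = tree_vec_extract (gsi, inner_type, vec,
				     build_int_cst (bitsizetype, 32),
				     build_int_cst (bitsizetype, 0));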

Reviewing the .gimple output, this folding takes a sample testcase
of
	vector bool int testb_0  (vector bool int x)
	{
	  return vec_splat (x, 0b00000); 
	}
from:
testb_0 (__vector __bool int x)
{
  __vector __bool intD.1486 D.2855;

  _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
  _2 = __builtin_altivec_vspltwD.1919 (_1, 0);
  D.2855 = VIEW_CONVERT_EXPR<__vector __bool intD.1486>(_2);
  return D.2855;
}
to:
testb_0 (__vector __bool int x)
{
  __vector __bool intD.1486 D.2855;

  _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
  D.2856 = BIT_FIELD_REF <_1, 32, 0>;
  _2 = {D.2856, D.2856, D.2856, D.2856};
  D.2855 = VIEW_CONVERT_EXPR<__vector __bool intD.1486>(_2);
  return D.2855;
}


Testcases are being posted as a separate patch.
    
OK for trunk?   
Thanks,
-Will
    
[gcc]
    
2018-08-07  Will Schmidt  <will_schmidt@vnet.ibm.com>
    
	* config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add support for
	early GIMPLE folding of vec_splat().
	* tree-vect-generic.c: Remove static from tree_vec_extract() definition.
	* gimple-fold.h: Add an extern declaration for tree_vec_extract().

Comments

Bill Schmidt Aug. 9, 2018, 1:53 p.m. UTC | #1
> On Aug 7, 2018, at 2:25 PM, Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
> 
> [...]
> 
> In review of the .gimple output, this folding takes a sample testcase
> of 
> 	vector bool int testb_0  (vector bool int x)
> 	{
> 	  return vec_splat (x, 0b00000); 
> 	}
> from:
> testb_0 (__vector __bool int x)
> {
>  __vector __bool intD.1486 D.2855;
> 
>  _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
>  _2 = __builtin_altivec_vspltwD.1919 (_1, 0);
>  D.2855 = VIEW_CONVERT_EXPR<__vector __bool intD.1486>(_2);
>  return D.2855;
> }
> to:
> testb_0 (__vector __bool int x)
> {
>  __vector __bool intD.1486 D.2855;
> 
>  _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
>  D.2856 = BIT_FIELD_REF <_1, 32, 0>;
>  _2 = {D.2856, D.2856, D.2856, D.2856};
>  D.2855 = VIEW_CONVERT_EXPR<__vector __bool intD.1486>(_2);
>  return D.2855;
> }
> 
This looks okay.
> 
> Testcases are being posted as a separate patch.
> 
> OK for trunk?   
> Thanks,
> -Will
> 
> [gcc]
> 
> 2018-08-07  Will Schmidt  <will_schmidt@vnet.ibm.com>
> 
> 	* config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add support for
> 	  early gimple folding of vec_splat().
> 	* tree-vect-generic.c: Remove static from tree_vec_extract() definition.
> 	* gimple-fold.h:  Add an extern define for tree_vec_extract().
> 
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 35c32be..acc6b49 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -15764,10 +15764,56 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> 	 tree splat_tree = build_vector_from_val (TREE_TYPE (lhs), splat_value);
> 	 g = gimple_build_assign (lhs, splat_tree);
> 	 gimple_set_location (g, gimple_location (stmt));
> 	 gsi_replace (gsi, g, true);
> 	 return true;
> +	}
> +
> +    /* Flavors of vec_splat.  */
> +    // a = vec_splat (b, 0x3) ;  // a = { b[3],b[3],b[3],...};
> +    case ALTIVEC_BUILTIN_VSPLTB:
> +    case ALTIVEC_BUILTIN_VSPLTH:
> +    case ALTIVEC_BUILTIN_VSPLTW:
> +    case VSX_BUILTIN_XXSPLTD_V2DI:
> +    case VSX_BUILTIN_XXSPLTD_V2DF:
> +      {
> +	arg0 = gimple_call_arg (stmt, 0); /* input vector.  */
> +	arg1 = gimple_call_arg (stmt, 1); /* index into arg0.  */
> +	/* Only fold the vec_splat_*() if arg1 is a constant
> +	   5-bit unsigned literal.  */
> +	if (TREE_CODE (arg1) != INTEGER_CST || TREE_INT_CST_LOW (arg1) > 0x1f)
> +	  return false;
> +
> +	lhs = gimple_call_lhs (stmt);
> +	tree lhs_type = TREE_TYPE (lhs);
> +
> +	tree splat;
> +	if (TREE_CODE (arg0) == VECTOR_CST)
> +	  splat = VECTOR_CST_ELT (arg0, TREE_INT_CST_LOW (arg1));

You should force arg1 into range before this.  It should be adjusted modulo
VECTOR_CST_NELTS (arg0).  Yes, the underlying vsplt* instructions will
handle the modulo itself, but when expanding early we should make the
correction early, else something is likely to explode.  Do you have a test
where arg1 is less than 32 but greater than the number of elements?
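
Something like the following, reusing the patch's variables (just a sketch;
the exact placement is up to you):

	/* Mirror the modulo behavior of the vsplt* instructions before
	   indexing into the constant vector.  */
	unsigned int n_elts = VECTOR_CST_NELTS (arg0);
	unsigned int splat_idx = TREE_INT_CST_LOW (arg1) % n_elts;
	splat = VECTOR_CST_ELT (arg0, splat_idx);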

> +	else
> +	  {
> +	    /* Determine (in bits) the length and start location of the
> +	       splat value for a call to the tree_vec_extract helper.  */
> +	    int tree_size_in_bits = TREE_INT_CST_LOW (size_in_bytes (lhs_type))
> +				    * BITS_PER_UNIT;

I expect you should use arg0's type rather than lhs_type here.  They
should be the same, of course; it's just clearer.
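
I.e., a sketch of the suggested tweak:

	int tree_size_in_bits
	  = TREE_INT_CST_LOW (size_in_bytes (TREE_TYPE (arg0))) * BITS_PER_UNIT;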

> +	    int splat_elem_size = tree_size_in_bits / VECTOR_CST_NELTS (arg0);
> +	    int splat_start_bit = TREE_INT_CST_LOW (arg1) * splat_elem_size;
> +	    /* Do not attempt to early-fold if the size + specified offset into
> +	       the vector would touch outside of the source vector.  */
> +	    if ((splat_start_bit + splat_elem_size) > tree_size_in_bits)
> +	      return false;

This won't be necessary once you fix the modulo issue.

The rest LGTM.

Thanks,
Bill

> +	    tree len = build_int_cst (bitsizetype, splat_elem_size);
> +	    tree start = build_int_cst (bitsizetype, splat_start_bit);
> +	    splat = tree_vec_extract (gsi, TREE_TYPE (lhs_type), arg0,
> +				      len, start);
> +	}
> +	/* And finally, build the new vector.  */
> +	tree splat_tree = build_vector_from_val (lhs_type, splat);
> +	g = gimple_build_assign (lhs, splat_tree);
> +	gimple_set_location (g, gimple_location (stmt));
> +	gsi_replace (gsi, g, true);
> +	return true;
>       }
> 
>     /* vec_mergel (integrals).  */
>     case ALTIVEC_BUILTIN_VMRGLH:
>     case ALTIVEC_BUILTIN_VMRGLW:
> diff --git a/gcc/gimple-fold.h b/gcc/gimple-fold.h
> index 04e9bfa..e634180 100644
> --- a/gcc/gimple-fold.h
> +++ b/gcc/gimple-fold.h
> @@ -59,10 +59,11 @@ extern tree gimple_fold_indirect_ref (tree);
> extern bool gimple_fold_builtin_sprintf (gimple_stmt_iterator *);
> extern bool gimple_fold_builtin_snprintf (gimple_stmt_iterator *);
> extern bool arith_code_with_undefined_signed_overflow (tree_code);
> extern gimple_seq rewrite_to_defined_overflow (gimple *);
> extern void replace_call_with_value (gimple_stmt_iterator *, tree);
> +extern tree tree_vec_extract (gimple_stmt_iterator *, tree, tree, tree, tree);
> 
> /* gimple_build, functionally matching fold_buildN, outputs stmts
>    int the provided sequence, matching and simplifying them on-the-fly.
>    Supposed to replace force_gimple_operand (fold_buildN (...), ...).  */
> extern tree gimple_build (gimple_seq *, location_t,
> diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c
> index 909f790..1c9701d 100644
> --- a/gcc/tree-vect-generic.c
> +++ b/gcc/tree-vect-generic.c
> @@ -118,11 +118,11 @@ build_word_mode_vector_type (int nunits)
> 
> typedef tree (*elem_op_func) (gimple_stmt_iterator *,
> 			      tree, tree, tree, tree, tree, enum tree_code,
> 			      tree);
> 
> -static inline tree
> +tree
> tree_vec_extract (gimple_stmt_iterator *gsi, tree type,
> 		  tree t, tree bitsize, tree bitpos)
> {
>   if (TREE_CODE (t) == SSA_NAME)
>     {
> 
>
Will Schmidt Aug. 9, 2018, 9:07 p.m. UTC | #2
On Thu, 2018-08-09 at 08:53 -0500, Bill Schmidt wrote:
> > On Aug 7, 2018, at 2:25 PM, Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
> > [...]
> > +	arg0 = gimple_call_arg (stmt, 0); /* input vector.  */
> > +	arg1 = gimple_call_arg (stmt, 1); /* index into arg0.  */
> > +	/* Only fold the vec_splat_*() if arg1 is a constant
> > +	   5-bit unsigned literal.  */
> > +	if (TREE_CODE (arg1) != INTEGER_CST || TREE_INT_CST_LOW (arg1) > 0x1f)
> > +	  return false;
> > +
> > +	lhs = gimple_call_lhs (stmt);
> > +	tree lhs_type = TREE_TYPE (lhs);
> > +
> > +	tree splat;
> > +	if (TREE_CODE (arg0) == VECTOR_CST)
> > +	  splat = VECTOR_CST_ELT (arg0, TREE_INT_CST_LOW (arg1));
> 
> You should force arg1 into range before this.  It should be adjusted modulo
> VECTOR_CST_NELTS (arg0).  Yes, the underlying vsplt* instructions will

OK.

> handle the modulo itself, but when expanding early we should make the
> correction early, else something is likely to explode.  Do you have a test
> where arg1 is less than 32 but greater than the number of elements?

Almost. :-)   I test for arg1 < 32 and > #elms, but not with the
arg0-is-constant scenario.  I'll craft up a couple more testcases for
that.
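
Something along these lines, perhaps (a hypothetical testcase: index 3 is a
valid 5-bit literal but exceeds the two elements of a V2DI vector, so with
the modulo adjustment it splats element 3 % 2 == 1):

	vector signed long long
	test_splat_large_index (vector signed long long x)
	{
	  return vec_splat (x, 3);
	}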

> 
> > +	else
> > +	  {
> > +	    /* Determine (in bits) the length and start location of the
> > +	       splat value for a call to the tree_vec_extract helper.  */
> > +	    int tree_size_in_bits = TREE_INT_CST_LOW (size_in_bytes (lhs_type))
> > +				    * BITS_PER_UNIT;
> 
> I expect you should use arg0's type rather than lhs_type here.  They
> should be the same, of course; it's just clearer.

ok.

> 
> > +	    int splat_elem_size = tree_size_in_bits / VECTOR_CST_NELTS (arg0);
> > +	    int splat_start_bit = TREE_INT_CST_LOW (arg1) * splat_elem_size;
> > +	    /* Do not attempt to early-fold if the size + specified offset into
> > +	       the vector would touch outside of the source vector.  */
> > +	    if ((splat_start_bit + splat_elem_size) > tree_size_in_bits)
> > +	      return false;
> 
> This won't be necessary once you fix the modulo issue.

ok.

> The rest LGTM.

thanks for the review and feedback. :-)

-Will



Richard Biener Aug. 17, 2018, 2:11 p.m. UTC | #3
On Tue, Aug 7, 2018 at 9:25 PM Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
>
> [...]
>
> In review of the .gimple output, this folding takes a sample testcase
> of
>         vector bool int testb_0  (vector bool int x)
>         {
>           return vec_splat (x, 0b00000);
>         }
> from:
> testb_0 (__vector __bool int x)
> {
>   __vector __bool intD.1486 D.2855;
>
>   _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
>   _2 = __builtin_altivec_vspltwD.1919 (_1, 0);
>   D.2855 = VIEW_CONVERT_EXPR<__vector __bool intD.1486>(_2);
>   return D.2855;
> }
> to:
>  testb_0 (__vector __bool int x)
> {
>   __vector __bool intD.1486 D.2855;
>
>   _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
>   D.2856 = BIT_FIELD_REF <_1, 32, 0>;
>   _2 = {D.2856, D.2856, D.2856, D.2856};
>   D.2855 = VIEW_CONVERT_EXPR<__vector __bool intD.1486>(_2);
>   return D.2855;
> }
>
>
> Testcases are being posted as a separate patch.
>
> OK for trunk?

It certainly works; nowadays we have VEC_DUPLICATE_EXPR, though, so
you could simply do

 _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
 D.2856 = BIT_FIELD_REF <_1, 32, 0>;
 D.2855 = VEC_DUPLICATE_EXPR <__vector __bool intD.1486> (D.2856);

I'm not sure which variant is better for followup optimizations, or whether
we already fold { D.2856, D.2856, D.2856, D.2856 } to VEC_DUPLICATE_EXPR.
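
A minimal sketch of that variant inside rs6000_gimple_fold_builtin, reusing
lhs and splat from the patch (whether VEC_DUPLICATE_EXPR is valid for
fixed-length vectors here is exactly the open question):

	/* Build lhs = VEC_DUPLICATE_EXPR <splat> directly instead of a
	   four-element CONSTRUCTOR.  */
	g = gimple_build_assign (lhs, VEC_DUPLICATE_EXPR, splat);
	gimple_set_location (g, gimple_location (stmt));
	gsi_replace (gsi, g, true);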

Richard may know more.

Richard.

Richard Sandiford Aug. 18, 2018, 5:01 p.m. UTC | #4
Richard Biener <richard.guenther@gmail.com> writes:
> On Tue, Aug 7, 2018 at 9:25 PM Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
>> [...]
>
> It certainly works, nowadays we have VEC_DUPLICATE_EXPR though
> so you could simply do
>
>  _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
>  D.2856 = BIT_FIELD_REF <_1, 32, 0>;
>  D.2855 = VEC_DUPLICATE_EXPR <__vector __bool intD.1486> (D.2856);
>
> not sure what variant is better for followup optimizations and whether
> we already fold that { D.2856, D.2856, D.2856, D.2856 }; to VEC_DUPLICATE_EXPR.
>
> Richard may know more.

We only use VEC_DUPLICATE_EXPR for variable-length vectors.  I still owe
you a GCC 9 update for:

  https://gcc.gnu.org/ml/gcc-patches/2017-12/msg01039.html

(hadn't forgotten!)

It's a diversion from the patch, sorry, but I think the main alternatives are:

(1) Allow either operand of a VEC_PERM_EXPR to be a scalar
    ------------------------------------------------------

    Handles VEC_DUPLICATE_EXPR <a> as:
        VEC_PERM_EXPR <a, a, { 0, ... }>

    Handles IFN_VEC_SHL_INSERT <avec, b> as:
        VEC_PERM_EXPR <b, avec, { 0, 1, 2, 3, ... }>
    or: VEC_PERM_EXPR <avec, b, { n, 0, 1, 2, ... }>

    Can also insert a single "a" at the beginning of a vector of "b":
        VEC_PERM_EXPR <a, b, { 0, 1, 1, 1, ... }>

    Disadvantages:

    (a) It's still a bit limited.  E.g. it can't insert two scalars at
        the beginning of a vector of "b", which might be useful for SLP.

    (b) Either all targets would need to handle this case explicitly
        or something would need to massage the tree permute mask before
        the targets see it.  E.g.:

           VEC_PERM_EXPR <a, b, { 0, 1, 1, ... }>
        -> permute of { 0, n, n, ... } on two vector broadcast rtxes

    (c) There'd still be an inconsistency.  Either:
        (i) we'd only use VEC_PERM_EXPRs for variable-length vectors or
        (ii) we'd have to check before creating a CONSTRUCTOR whether
             it should be represented as a VEC_PERM_EXPR instead.

(2) Allow only the second operand of a VEC_PERM_EXPR to be a scalar
    ---------------------------------------------------------------

    Handles VEC_DUPLICATE_EXPR <a> as:
        VEC_PERM_EXPR <{ 0, ... }, a, { n, ... }>

    Handles IFN_VEC_SHL_INSERT <avec, b> as:
        VEC_PERM_EXPR <avec, b, { n, 0, 1, 2, ... }>;

    Advantages:

    - Avoids the need for target changes: the mask is the same if the scalar
      is replaced by a vector.

    Disadvantages:

    - (a) still holds (and now we can't do a two-scalar permute).

    - (c) still holds.

(3) Keep VEC_DUPLICATE_EXPR but use it for everything
    -------------------------------------------------

    Advantages:

    - The handling of duplicates becomes consistent across vector types.

    - There are no changes to VEC_PERM_EXPR or targets.

    - We can still replace IFN_VEC_SHL_INSERT with a VEC_PERM_EXPR,
      as a separate change.

    Disadvantages:

    - (c) still holds.

    (d) Vector duplication stays a bit of a special case (which was the
        original problem).

(4) Use the VECTOR_CST encoding for CONSTRUCTORs too
    ------------------------------------------------

    Advantages:

    - Reduces memory overhead of CONSTRUCTORs for vector broadcasts
      in constant-length vectors.

    - There's only one tree code to check.

    - Many more vectors have the same representation for constant- and
      variable-length vectors.

    - There's more consistency between constant element values and
      variable element values.

    - We can still replace IFN_VEC_SHL_INSERT with a VEC_PERM_EXPR,
      as a separate change.

    Disadvantages:

    (e) Completely following the VECTOR_CST encoding would mean having
        to encode the stepped forms based on two values in the series.
        E.g. a 3-pattern:

          { a : b : c : ... }

        (ad-hoc syntax) would be: 

          { a, b, c, c * 2 - b, c * 3 - b * 2, ... }

        so the CONSTRUCTOR would include hidden arithmetic.  This means
        that the value of any given element (rather than just a directly-
        encoded one) would in general not be a gimple value.  E.g. in
        the above case, asking for element 3 would give a GENERIC tree.

    (f) If vector CONSTRUCTORs seem like a mistake then this will too.

(5) Like (4), but don't allow the stepped case
    ------------------------------------------

    Advantages:

    - As for (4), and avoids the hidden arithmetic in (e).

    Disadvantages:

    - (f) still holds.

    (g) Stepped vectors need to be handled separately.

For (f), we could (and might need to) replace CONSTRUCTOR with a new
tree code.  It would still have to be GIMPLE_SINGLE_RHS though, if we
keep the current "gassigns have 4 operands" assumption.

I'm not sure (g) is really a disadvantage.  We could replace
VEC_SERIES_EXPR <a, b> with:

    basev = { a, ... }
    stepv = { b, ... }
    tmp = stepv * { 0, 1, 2, ... };
    series = basev + tmp

and match it as an IFN_VEC_SERIES for targets that have it.
So code constructing vectors would never have to know about
VEC_SERIES_EXPR/IFN_VEC_SERIES or IFN_VEC_SHL_INSERT.
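
As a concrete instance of that rewrite, VEC_SERIES_EXPR <3, 2> on a
4-element vector would become:

    basev  = { 3, 3, 3, 3 }
    stepv  = { 2, 2, 2, 2 }
    tmp    = stepv * { 0, 1, 2, 3 }   /* = { 0, 2, 4, 6 } */
    series = basev + tmp              /* = { 3, 5, 7, 9 } */

i.e. the arithmetic series starting at 3 with step 2.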

So I'm leaning towards (5) at the moment.  What do you think?

Thanks,
Richard
Richard Biener Aug. 23, 2018, 1:50 p.m. UTC | #5
On Sat, Aug 18, 2018 at 7:01 PM Richard Sandiford
<rdsandiford@googlemail.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Tue, Aug 7, 2018 at 9:25 PM Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
> >> [...]
> >
> > It certainly works, nowadays we have VEC_DUPLICATE_EXPR though
> > so you could simply do
> >
> >  _1 = VIEW_CONVERT_EXPR<__vector signed intD.1468>(xD.2778);
> >  D.2856 = BIT_FIELD_REF <_1, 32, 0>;
> >  D.2855 = VEC_DUPLICATE_EXPR <__vector __bool intD.1486> (D.2856);
> >
> > not sure what variant is better for followup optimizations and whether
> > we already fold that { D.2856, D.2856, D.2856, D.2856 }; to VEC_DUPLICATE_EXPR.
> >
> > Richard may know more.
>
> We only use VEC_DUPLICATE_EXPR for variable-length vectors.  I still owe
> you a GCC 9 update for:
>
>   https://gcc.gnu.org/ml/gcc-patches/2017-12/msg01039.html
>
> (hadn't forgotten!)
>
> It's a diversion from the patch, sorry, but I think the main alternatives are:
>
> (1) Allow either operand of a VEC_PERM_EXPR to be a scalar
>     ------------------------------------------------------
>
>     Handles VEC_DUPLICATE_EXPR <a> as:
>         VEC_PERM_EXPR <a, a, { 0, ... }>
>
>     Handles IFN_VEC_SHL_INSERT <avec, b> as:
>         VEC_PERM_EXPR <b, avec, { 0, 1, 2, 3, ... }>
>     or: VEC_PERM_EXPR <avec, b, { n, 0, 1, 2, ... }>
>
>     Can also insert a single "a" at the beginning of a vector of "b":
>         VEC_PERM_EXPR <a, b, { 0, 1, 1, 1, ... }>
>
>     Disadvantages:
>
>     (a) It's still a bit limited.  E.g. it can't insert two scalars at
>         the beginning of a vector of "b", which might be useful for SLP.
>
>     (b) Either all targets would need to handle this case explicitly
>         or something would need to massage the tree permute mask before
>         the targets see it.  E.g.:
>
>            VEC_PERM_EXPR <a, b, { 0, 1, 1, ... }>
>         -> permute of { 0, n, n, ... } on two vector broadcast rtxes
>
>     (c) There'd still be an inconsistency.  Either:
>         (i) we'd only use VEC_PERM_EXPRs for variable-length vectors or
>         (ii) we'd have to check before creating a CONSTRUCTOR whether
>              it should be represented as a VEC_PERM_EXPR instead.
>
> (2) Allow only the second operand of a VEC_PERM_EXPR to be a scalar
>     ---------------------------------------------------------------
>
>     Handles VEC_DUPLICATE_EXPR <a> as:
>         VEC_PERM_EXPR <{ 0, ... }, a, { n, ... }>
>
>     Handles IFN_VEC_SHL_INSERT <avec, b> as:
>         VEC_PERM_EXPR <avec, b, { n, 0, 1, 2, ... }>;
>
>     Advantages:
>
>     - Avoids the need for target changes: the mask is the same if the scalar
>       is replaced by a vector.

Interesting idea ;)

>     Disadvantages:
>
>     - (a) still holds (and now we can't do a two-scalar permute).
>
>     - (c) still holds.
>
> (3) Keep VEC_DUPLICATE_EXPR but use it for everything
>     -------------------------------------------------
>
>     Advantages:
>
>     - The handling of duplicates becomes consistent across vector types.

I think this is definitely important; I don't much like the idea of
having some stuff only used for the variable-size case.

>     - There are no changes to VEC_PERM_EXPR or targets.
>
>     - We can still replace IFN_VEC_SHL_INSERT with a VEC_PERM_EXPR,
>       as a separate change.
>
>     Disadvantages:
>
>     - (c) still holds.
>
>     (d) Vector duplication stays a bit of a special case (which was the
>         original problem).
>
> (4) Use the VECTOR_CST encoding for CONSTRUCTORs too
>     ------------------------------------------------
>
>     Advantages:
>
>     - Reduces memory overhead of CONSTRUCTORs for vector broadcasts
>       in constant-length vectors.
>
>     - There's only one tree code to check.
>
>     - Many more vectors have the same representation for constant- and
>       variable-length vectors.
>
>     - There's more consistency between constant element values and
>       variable element values.
>
>     - We can still replace IFN_VEC_SHL_INSERT with a VEC_PERM_EXPR,
>       as a separate change.
>
>     Disadvantages:
>
>     (e) Completely following the VECTOR_CST encoding would mean having
>         to encode the stepped forms based on two values in the series.
>         E.g. a 3-pattern:
>
>           { a : b : c : ... }
>
>         (ad-hoc syntax) would be:
>
>           { a, b, c, c * 2 - b, c * 3 - b * 2, ... }
>
>         so the CONSTRUCTOR would include hidden arithmetic.  This means
>         that the value of any given element (rather than just a directly-
>         encoded one) would in general not be a gimple value.  E.g. in
>         the above case, asking for element 3 would give a GENERIC tree.

Eh.  The "advantage" would be it handles the VEC_SERIES_EXPR case as well?

>
>     (f) If vector CONSTRUCTORs seem like a mistake then this will too.

I still think they are ;)

> (5) Like (4), but don't allow the stepped case
>     ------------------------------------------
>
>     Advantages:
>
>     - As for (4), and avoids the hidden arithmetic in (e).
>
>     Disadvantages:
>
>     - (f) still holds.
>
>     (g) Stepped vectors need to be handled separately.
>
> For (f), we could (and might need to) replace CONSTRUCTOR with a new
> tree code.  It would still have to be GIMPLE_SINGLE_RHS though, if we
> keep the current "gassigns have 4 operands" assumption.
>
> I'm not sure (g) is really a disadvantage.  We could replace
> VEC_SERIES_EXPR <a, b> with:
>
>     basev = { a, ... }
>     stepv = { b, ... }
>     tmp = stepv * { 0, 1, 2, ... };
>     series = basev + tmp
>
> and match it as an IFN_VEC_SERIES for targets that have it.
> So code constructing vectors would never have to know about
> VEC_SERIES_EXPR/IFN_VEC_SERIES or IFN_VEC_SHL_INSERT.
>
> So I'm leaning towards (5) at the moment.  What do you think?

Generally I dislike CONSTRUCTOR because it hides the implementation
complexity.  That is, for most vector operations we only allow things
in the IL that match what the target is willing to expand (which doesn't
necessarily mean it expands to one target instruction ...).  At least after
vector lowering.

So this would advocate for exposing all HW instructions as IFN_* but of
course that's too messy to handle in optimizers.  Instead what we do
for VEC_PERM_EXPR via can_vec_perm_const_p looks like a good
way forward to me.

That means (2) looks appealing to me, given a two-scalar generic permute
isn't likely to be present in HW.  Still, VEC_PERM_EXPR <scalar, scalar, { 0, 1 }>
as a way to concat two scalars to a vector makes sense to me.

We currently have

/* Vector permutation expression.  A = VEC_PERM_EXPR<v0, v1, mask> means

   N = length(mask)
   foreach i in N:
     M = mask[i] % (2*N)
     A = M < N ? v0[M] : v1[M-N]

   V0 and V1 are vectors of the same type.  MASK is an integer-typed
   vector.  The number of MASK elements must be the same with the
   number of elements in V0 and V1.  The size of the inner type
   of the MASK and of the V0 and V1 must be the same.
*/
DEFTREECODE (VEC_PERM_EXPR, "vec_perm_expr", tcc_expression, 3)

and GIMPLE checking enforces same types for the permutation result
and the two permuted vectors.
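
As a worked instance of those semantics: with N = 4, v0 = { 10, 11, 12, 13 },
v1 = { 20, 21, 22, 23 } and mask = { 0, 5, 2, 7 }, each M = mask[i] % 8 is
just the mask element itself, so

   A = { v0[0], v1[1], v0[2], v1[3] } = { 10, 21, 12, 23 }.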

I'd relax that a bit to allow scalar / vector V0 / V1 and not require
the result to be the same as either vector type but just have the
same element type and element count as the mask vector.  That
would allow concats.
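
Under the relaxed rule the cases discussed above would each be a single
VEC_PERM_EXPR, e.g. (sketches in the notation used so far):

   dup    = VEC_PERM_EXPR <{ 0, ... }, a, { n, ... }>;     /* splat scalar a */
   concat = VEC_PERM_EXPR <a, b, { 0, 1 }>;                /* concat two scalars */
   insert = VEC_PERM_EXPR <avec, b, { n, 0, 1, 2, ... }>;  /* shl-insert b */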

The goal would be that all primitive permute/concat (and extract?)
operations ISAs have can be expressed as a single VEC_PERM_EXPR.

And vector CONSTRUCTORs could be lowered using some automated
generator (as described by the "rewrite vectorizer" talks at the last
Cauldrons) that is seeded by some kind of machine description for
permutation operations.

That said, this would leave VEC_SERIES_EXPR alone (unless we want to
split it up as you say), and it wouldn't complicate CONSTRUCTORs,
but it would walk in the direction of exposing some more target details
at the expense of target complication in their vec_perm_const_ok hook
(until that automated generator materializes).

Richard.

>
> Thanks,
> Richard

Patch

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 35c32be..acc6b49 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -15764,10 +15764,56 @@  rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 	 tree splat_tree = build_vector_from_val (TREE_TYPE (lhs), splat_value);
 	 g = gimple_build_assign (lhs, splat_tree);
 	 gimple_set_location (g, gimple_location (stmt));
 	 gsi_replace (gsi, g, true);
 	 return true;
+	}
+
+    /* Flavors of vec_splat.  */
+    // a = vec_splat (b, 0x3) ;  // a = { b[3],b[3],b[3],...};
+    case ALTIVEC_BUILTIN_VSPLTB:
+    case ALTIVEC_BUILTIN_VSPLTH:
+    case ALTIVEC_BUILTIN_VSPLTW:
+    case VSX_BUILTIN_XXSPLTD_V2DI:
+    case VSX_BUILTIN_XXSPLTD_V2DF:
+      {
+	arg0 = gimple_call_arg (stmt, 0); /* input vector.  */
+	arg1 = gimple_call_arg (stmt, 1); /* index into arg0.  */
+	/* Only fold the vec_splat_*() if arg1 is a constant
+	   5-bit unsigned literal.  */
+	if (TREE_CODE (arg1) != INTEGER_CST || TREE_INT_CST_LOW (arg1) > 0x1f)
+	  return false;
+
+	lhs = gimple_call_lhs (stmt);
+	tree lhs_type = TREE_TYPE (lhs);
+
+	tree splat;
+	if (TREE_CODE (arg0) == VECTOR_CST)
+	  splat = VECTOR_CST_ELT (arg0, TREE_INT_CST_LOW (arg1));
+	else
+	  {
+	    /* Determine (in bits) the length and start location of the
+	       splat value for a call to the tree_vec_extract helper.  */
+	    int tree_size_in_bits = TREE_INT_CST_LOW (size_in_bytes (lhs_type))
+				    * BITS_PER_UNIT;
+	    int splat_elem_size = tree_size_in_bits / VECTOR_CST_NELTS (arg0);
+	    int splat_start_bit = TREE_INT_CST_LOW (arg1) * splat_elem_size;
+	    /* Do not attempt to early-fold if the size + specified offset into
+	       the vector would touch outside of the source vector.  */
+	    if ((splat_start_bit + splat_elem_size) > tree_size_in_bits)
+	      return false;
+	    tree len = build_int_cst (bitsizetype, splat_elem_size);
+	    tree start = build_int_cst (bitsizetype, splat_start_bit);
+	    splat = tree_vec_extract (gsi, TREE_TYPE (lhs_type), arg0,
+				      len, start);
+	  }
+	/* And finally, build the new vector.  */
+	tree splat_tree = build_vector_from_val (lhs_type, splat);
+	g = gimple_build_assign (lhs, splat_tree);
+	gimple_set_location (g, gimple_location (stmt));
+	gsi_replace (gsi, g, true);
+	return true;
       }
 
     /* vec_mergel (integrals).  */
     case ALTIVEC_BUILTIN_VMRGLH:
     case ALTIVEC_BUILTIN_VMRGLW:
diff --git a/gcc/gimple-fold.h b/gcc/gimple-fold.h
index 04e9bfa..e634180 100644
--- a/gcc/gimple-fold.h
+++ b/gcc/gimple-fold.h
@@ -59,10 +59,11 @@  extern tree gimple_fold_indirect_ref (tree);
 extern bool gimple_fold_builtin_sprintf (gimple_stmt_iterator *);
 extern bool gimple_fold_builtin_snprintf (gimple_stmt_iterator *);
 extern bool arith_code_with_undefined_signed_overflow (tree_code);
 extern gimple_seq rewrite_to_defined_overflow (gimple *);
 extern void replace_call_with_value (gimple_stmt_iterator *, tree);
+extern tree tree_vec_extract (gimple_stmt_iterator *, tree, tree, tree, tree);
 
 /* gimple_build, functionally matching fold_buildN, outputs stmts
    int the provided sequence, matching and simplifying them on-the-fly.
    Supposed to replace force_gimple_operand (fold_buildN (...), ...).  */
 extern tree gimple_build (gimple_seq *, location_t,
diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c
index 909f790..1c9701d 100644
--- a/gcc/tree-vect-generic.c
+++ b/gcc/tree-vect-generic.c
@@ -118,11 +118,11 @@  build_word_mode_vector_type (int nunits)
 
 typedef tree (*elem_op_func) (gimple_stmt_iterator *,
 			      tree, tree, tree, tree, tree, enum tree_code,
 			      tree);
 
-static inline tree
+tree
 tree_vec_extract (gimple_stmt_iterator *gsi, tree type,
 		  tree t, tree bitsize, tree bitpos)
 {
   if (TREE_CODE (t) == SSA_NAME)
     {