[RFC] First cut at using vec_construct for strided loads
diff mbox

Message ID CAFiYyc2iiGXyp8_GrvFFnxXRL6Ci9vDb97HojLmCrcTcqGLDgw@mail.gmail.com
State New
Headers show

Commit Message

Richard Biener June 8, 2016, 12:29 p.m. UTC
On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> This patch is a follow-up to the discussion generated by
> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html.  I've added
> vec_construct to the cost model for use in vect_model_load_cost, and
> implemented a cost calculation that makes sense to me for PowerPC.  I'm
> less certain about the default, i386, and spu implementations.  I took a
> guess at i386 from the discussions we had, and used the same calculation
> for the default and for spu.  I'm hoping you or others can fill in the
> blanks if I guessed badly.
>
> The i386 cost for vec_construct is different from all the others, which
> are parameterized for each processor description.  This should probably
> be parameterized in some way as well, but thought you'd know better than
> I how that should be.  Perhaps instead of
>
>         elements / 2 + 1
>
> it should be
>
>         (elements / 2) * X + Y
>
> where X and Y are taken from the processor description, and represent
> the cost of a merge and a permute, respectively.  Let me know what you
> think.

Just trying to understand how you arrived at the above formulas in investigating
strangely low cost for v16qi construction of 9.  If we pairwise reduce elements
with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd
get with not accounting an initial move of element zero into a vector and then
inserting each other element into that with elements - 1 inserts.

This also matches up with code-generation on x86_64 for

vT foo (T a, T b, ...)
{
  return (vT) {a, b, ... };
}

for any vector / element type combination I tried.  Thus the patch below.

I'll bootstrap / test that on x86_64-linux and I'm leaving other
targets to target
maintainers.

Ok for the i386 parts?

Thanks,
Richard.

2016-06-08  Richard Biener  <rguenther@suse.de>

        * targhooks.c (default_builtin_vectorization_cost): Adjust
        vec_construct cost.
        * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.



> Thanks,
> Bill
>
>
> 2012-06-12  Bill Schmidt  <wschmidt@linux.ibm.com>
>
>         * targhooks.c (default_builtin_vectorized_conversion): Handle
>         vec_construct, using vectype to base cost on subparts.
>         * target.h (enum vect_cost_for_stmt): Add vec_construct.
>         * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
>         instead of scalar_to-vec.
>         * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
>         vec_construct in same way as default for now.
>         * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>         * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
>         Handle vec_construct, including special case for 32-bit loads.
>
>
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c     (revision 188482)
> +++ gcc/targhooks.c     (working copy)
> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
>
>  int
>  default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> -                                    tree vectype ATTRIBUTE_UNUSED,
> +                                    tree vectype,
>                                      int misalign ATTRIBUTE_UNUSED)
>  {
> +  unsigned elements;
> +
>    switch (type_of_cost)
>      {
>        case scalar_stmt:
> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
>        case cond_branch_taken:
>          return 3;
>
> +      case vec_construct:
> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> +       gcc_assert (elements > 1);
> +       return elements / 2 + 1;
> +
>        default:
>          gcc_unreachable ();
>      }
> Index: gcc/target.h
> ===================================================================
> --- gcc/target.h        (revision 188482)
> +++ gcc/target.h        (working copy)
> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
>    cond_branch_not_taken,
>    cond_branch_taken,
>    vec_perm,
> -  vec_promote_demote
> +  vec_promote_demote,
> +  vec_construct
>  };
>
>  /* The target structure.  This holds all the backend hooks.  */
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c       (revision 188482)
> +++ gcc/tree-vect-stmts.c       (working copy)
> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
>    /* The loads themselves.  */
>    if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
>      {
> -      /* N scalar loads plus gathering them into a vector.
> -         ???  scalar_to_vec isn't the cost for that.  */
> +      /* N scalar loads plus gathering them into a vector.  */
> +      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>        inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
> -                     * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
> -      inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
> +                     * TYPE_VECTOR_SUBPARTS (vectype));
> +      inside_cost += ncopies
> +       * targetm.vectorize.builtin_vectorization_cost (vec_construct,
> +                                                       vectype, 0);
>      }
>    else
>      vect_get_load_cost (first_dr, ncopies,
> Index: gcc/config/spu/spu.c
> ===================================================================
> --- gcc/config/spu/spu.c        (revision 188482)
> +++ gcc/config/spu/spu.c        (working copy)
> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
>  /* Implement targetm.vectorize.builtin_vectorization_cost.  */
>  static int
>  spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> -                                tree vectype ATTRIBUTE_UNUSED,
> +                                tree vectype,
>                                  int misalign ATTRIBUTE_UNUSED)
>  {
> +  unsigned elements;
> +
>    switch (type_of_cost)
>      {
>        case scalar_stmt:
> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
>        case cond_branch_taken:
>          return 6;
>
> +      case vec_construct:
> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> +       gcc_assert (elements > 1);
> +       return elements / 2 + 1;
> +
>        default:
>          gcc_unreachable ();
>      }
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c      (revision 188482)
> +++ gcc/config/i386/i386.c      (working copy)
> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
>  /* Implement targetm.vectorize.builtin_vectorization_cost.  */
>  static int
>  ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> -                                 tree vectype ATTRIBUTE_UNUSED,
> +                                 tree vectype,
>                                   int misalign ATTRIBUTE_UNUSED)
>  {
> +  unsigned elements;
> +
>    switch (type_of_cost)
>      {
>        case scalar_stmt:
> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
>        case vec_promote_demote:
>          return ix86_cost->vec_stmt_cost;
>
> +      case vec_construct:
> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> +       gcc_assert (elements > 1);
> +       return elements / 2 + 1;
> +
>        default:
>          gcc_unreachable ();
>      }
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c  (revision 188482)
> +++ gcc/config/rs6000/rs6000.c  (working copy)
> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>                                     tree vectype, int misalign)
>  {
>    unsigned elements;
> +  tree elem_type;
>
>    switch (type_of_cost)
>      {
> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>
>          return 2;
>
> +      case vec_construct:
> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> +       elem_type = TREE_TYPE (vectype);
> +       gcc_assert (elements > 1);
> +       /* 32-bit vectors loaded into registers are stored as double
> +          precision, so we need n/2 converts in addition to the usual
> +          n/2 merges to construct a vector of short floats from them.  */
> +       if (SCALAR_FLOAT_TYPE_P (elem_type)
> +           && TYPE_PRECISION (elem_type) == 32)
> +         return elements + 1;
> +       else
> +         return elements / 2 + 1;
> +
>        default:
>          gcc_unreachable ();
>      }
>
>

Comments

Bill Schmidt June 8, 2016, 1:47 p.m. UTC | #1
Hi Richard,

> On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> 
> On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
> <wschmidt@linux.vnet.ibm.com> wrote:
>> This patch is a follow-up to the discussion generated by
>> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html.  I've added
>> vec_construct to the cost model for use in vect_model_load_cost, and
>> implemented a cost calculation that makes sense to me for PowerPC.  I'm
>> less certain about the default, i386, and spu implementations.  I took a
>> guess at i386 from the discussions we had, and used the same calculation
>> for the default and for spu.  I'm hoping you or others can fill in the
>> blanks if I guessed badly.
>> 
>> The i386 cost for vec_construct is different from all the others, which
>> are parameterized for each processor description.  This should probably
>> be parameterized in some way as well, but thought you'd know better than
>> I how that should be.  Perhaps instead of
>> 
>>        elements / 2 + 1
>> 
>> it should be
>> 
>>        (elements / 2) * X + Y
>> 
>> where X and Y are taken from the processor description, and represent
>> the cost of a merge and a permute, respectively.  Let me know what you
>> think.
> 
> Just trying to understand how you arrived at the above formulas in investigating
> strangely low cost for v16qi construction of 9.  If we pairwise reduce elements
> with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd
> get with not accounting an initial move of element zero into a vector and then
> inserting each other element into that with elements - 1 inserts.

What I wrote there only makes partial sense for certain types on Power, so far as
I can tell, and even then it doesn’t generalize properly.  When the scalar registers
are contained in the vector registers (as happens for floating-point on Power), then
you can do some merges and other forms of permutes to combine them faster
than doing specific inserts.  But that isn’t a general solution even on Power; for the
integer modes we still do inserts.

So what you have makes sense to me, and what’s currently in place for Power needs
work also, so far as I can tell.  I’ll take a note to revisit this.

— Bill

> 
> This also matches up with code-generation on x86_64 for
> 
> vT foo (T a, T b, ...)
> {
>  return (vT) {a, b, ... };
> }
> 
> for any vector / element type combination I tried.  Thus the patch below.
> 
> I'll bootstrap / test that on x86_64-linux and I'm leaving other
> targets to target
> maintainers.
> 
> Ok for the i386 parts?
> 
> Thanks,
> Richard.
> 
> 2016-06-08  Richard Biener  <rguenther@suse.de>
> 
>        * targhooks.c (default_builtin_vectorization_cost): Adjust
>        vec_construct cost.
>        * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> 
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c     (revision 237196)
> +++ gcc/targhooks.c     (working copy)
> @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum
>         return 3;
> 
>       case vec_construct:
> -       elements = TYPE_VECTOR_SUBPARTS (vectype);
> -       return elements / 2 + 1;
> +       return TYPE_VECTOR_SUBPARTS (vectype) - 1;
> 
>       default:
>         gcc_unreachable ();
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c      (revision 237196)
> +++ gcc/config/i386/i386.c      (working copy)
> @@ -49503,8 +49520,6 @@ static int
> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>                                  tree vectype, int)
> {
> -  unsigned elements;
> -
>   switch (type_of_cost)
>     {
>       case scalar_stmt:
> @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve
>         return ix86_cost->vec_stmt_cost;
> 
>       case vec_construct:
> -       elements = TYPE_VECTOR_SUBPARTS (vectype);
> -       return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
> +       return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
> 
>       default:
>         gcc_unreachable ();
> 
> 
>> Thanks,
>> Bill
>> 
>> 
>> 2012-06-12  Bill Schmidt  <wschmidt@linux.ibm.com>
>> 
>>        * targhooks.c (default_builtin_vectorized_conversion): Handle
>>        vec_construct, using vectype to base cost on subparts.
>>        * target.h (enum vect_cost_for_stmt): Add vec_construct.
>>        * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
>>        instead of scalar_to-vec.
>>        * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
>>        vec_construct in same way as default for now.
>>        * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>>        * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
>>        Handle vec_construct, including special case for 32-bit loads.
>> 
>> 
>> Index: gcc/targhooks.c
>> ===================================================================
>> --- gcc/targhooks.c     (revision 188482)
>> +++ gcc/targhooks.c     (working copy)
>> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
>> 
>> int
>> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>> -                                    tree vectype ATTRIBUTE_UNUSED,
>> +                                    tree vectype,
>>                                     int misalign ATTRIBUTE_UNUSED)
>> {
>> +  unsigned elements;
>> +
>>   switch (type_of_cost)
>>     {
>>       case scalar_stmt:
>> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
>>       case cond_branch_taken:
>>         return 3;
>> 
>> +      case vec_construct:
>> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
>> +       gcc_assert (elements > 1);
>> +       return elements / 2 + 1;
>> +
>>       default:
>>         gcc_unreachable ();
>>     }
>> Index: gcc/target.h
>> ===================================================================
>> --- gcc/target.h        (revision 188482)
>> +++ gcc/target.h        (working copy)
>> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
>>   cond_branch_not_taken,
>>   cond_branch_taken,
>>   vec_perm,
>> -  vec_promote_demote
>> +  vec_promote_demote,
>> +  vec_construct
>> };
>> 
>> /* The target structure.  This holds all the backend hooks.  */
>> Index: gcc/tree-vect-stmts.c
>> ===================================================================
>> --- gcc/tree-vect-stmts.c       (revision 188482)
>> +++ gcc/tree-vect-stmts.c       (working copy)
>> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
>>   /* The loads themselves.  */
>>   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
>>     {
>> -      /* N scalar loads plus gathering them into a vector.
>> -         ???  scalar_to_vec isn't the cost for that.  */
>> +      /* N scalar loads plus gathering them into a vector.  */
>> +      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>>       inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
>> -                     * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
>> -      inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
>> +                     * TYPE_VECTOR_SUBPARTS (vectype));
>> +      inside_cost += ncopies
>> +       * targetm.vectorize.builtin_vectorization_cost (vec_construct,
>> +                                                       vectype, 0);
>>     }
>>   else
>>     vect_get_load_cost (first_dr, ncopies,
>> Index: gcc/config/spu/spu.c
>> ===================================================================
>> --- gcc/config/spu/spu.c        (revision 188482)
>> +++ gcc/config/spu/spu.c        (working copy)
>> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
>> /* Implement targetm.vectorize.builtin_vectorization_cost.  */
>> static int
>> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>> -                                tree vectype ATTRIBUTE_UNUSED,
>> +                                tree vectype,
>>                                 int misalign ATTRIBUTE_UNUSED)
>> {
>> +  unsigned elements;
>> +
>>   switch (type_of_cost)
>>     {
>>       case scalar_stmt:
>> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
>>       case cond_branch_taken:
>>         return 6;
>> 
>> +      case vec_construct:
>> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
>> +       gcc_assert (elements > 1);
>> +       return elements / 2 + 1;
>> +
>>       default:
>>         gcc_unreachable ();
>>     }
>> Index: gcc/config/i386/i386.c
>> ===================================================================
>> --- gcc/config/i386/i386.c      (revision 188482)
>> +++ gcc/config/i386/i386.c      (working copy)
>> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
>> /* Implement targetm.vectorize.builtin_vectorization_cost.  */
>> static int
>> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>> -                                 tree vectype ATTRIBUTE_UNUSED,
>> +                                 tree vectype,
>>                                  int misalign ATTRIBUTE_UNUSED)
>> {
>> +  unsigned elements;
>> +
>>   switch (type_of_cost)
>>     {
>>       case scalar_stmt:
>> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
>>       case vec_promote_demote:
>>         return ix86_cost->vec_stmt_cost;
>> 
>> +      case vec_construct:
>> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
>> +       gcc_assert (elements > 1);
>> +       return elements / 2 + 1;
>> +
>>       default:
>>         gcc_unreachable ();
>>     }
>> Index: gcc/config/rs6000/rs6000.c
>> ===================================================================
>> --- gcc/config/rs6000/rs6000.c  (revision 188482)
>> +++ gcc/config/rs6000/rs6000.c  (working copy)
>> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>>                                    tree vectype, int misalign)
>> {
>>   unsigned elements;
>> +  tree elem_type;
>> 
>>   switch (type_of_cost)
>>     {
>> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>> 
>>         return 2;
>> 
>> +      case vec_construct:
>> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
>> +       elem_type = TREE_TYPE (vectype);
>> +       gcc_assert (elements > 1);
>> +       /* 32-bit vectors loaded into registers are stored as double
>> +          precision, so we need n/2 converts in addition to the usual
>> +          n/2 merges to construct a vector of short floats from them.  */
>> +       if (SCALAR_FLOAT_TYPE_P (elem_type)
>> +           && TYPE_PRECISION (elem_type) == 32)
>> +         return elements + 1;
>> +       else
>> +         return elements / 2 + 1;
>> +
>>       default:
>>         gcc_unreachable ();
>>     }
>> 
>> 
>
Richard Biener June 8, 2016, 2:05 p.m. UTC | #2
On Wed, 8 Jun 2016, Bill Schmidt wrote:

> Hi Richard,
> 
> > On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> > 
> > On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
> > <wschmidt@linux.vnet.ibm.com> wrote:
> >> This patch is a follow-up to the discussion generated by
> >> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html.  I've added
> >> vec_construct to the cost model for use in vect_model_load_cost, and
> >> implemented a cost calculation that makes sense to me for PowerPC.  I'm
> >> less certain about the default, i386, and spu implementations.  I took a
> >> guess at i386 from the discussions we had, and used the same calculation
> >> for the default and for spu.  I'm hoping you or others can fill in the
> >> blanks if I guessed badly.
> >> 
> >> The i386 cost for vec_construct is different from all the others, which
> >> are parameterized for each processor description.  This should probably
> >> be parameterized in some way as well, but thought you'd know better than
> >> I how that should be.  Perhaps instead of
> >> 
> >>        elements / 2 + 1
> >> 
> >> it should be
> >> 
> >>        (elements / 2) * X + Y
> >> 
> >> where X and Y are taken from the processor description, and represent
> >> the cost of a merge and a permute, respectively.  Let me know what you
> >> think.
> > 
> > Just trying to understand how you arrived at the above formulas in investigating
> > strangely low cost for v16qi construction of 9.  If we pairwise reduce elements
> > with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd
> > get with not accounting an initial move of element zero into a vector and then
> > inserting each other element into that with elements - 1 inserts.
> 
> What I wrote there only makes partial sense for certain types on Power, so far as
> I can tell, and even then it doesn’t generalize properly.  When the scalar registers
> are contained in the vector registers (as happens for floating-point on Power), then
> you can do some merges and other forms of permutes to combine them faster
> than doing specific inserts.  But that isn’t a general solution even on Power; for the
> integer modes we still do inserts.

You mean Power has instructions to combine more than two vector registers
into one?  Otherwise you still need n / 2 plus n / 4 plus n / 8 ...
"permutes" which boils down to n - 1.
 
> So what you have makes sense to me, and what’s currently in place for Power needs
> work also, so far as I can tell.  I’ll take a note to revisit this.

Thanks.
Richard.

> — Bill
> 
> > 
> > This also matches up with code-generation on x86_64 for
> > 
> > vT foo (T a, T b, ...)
> > {
> >  return (vT) {a, b, ... };
> > }
> > 
> > for any vector / element type combination I tried.  Thus the patch below.
> > 
> > I'll bootstrap / test that on x86_64-linux and I'm leaving other
> > targets to target
> > maintainers.
> > 
> > Ok for the i386 parts?
> > 
> > Thanks,
> > Richard.
> > 
> > 2016-06-08  Richard Biener  <rguenther@suse.de>
> > 
> >        * targhooks.c (default_builtin_vectorization_cost): Adjust
> >        vec_construct cost.
> >        * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> > 
> > Index: gcc/targhooks.c
> > ===================================================================
> > --- gcc/targhooks.c     (revision 237196)
> > +++ gcc/targhooks.c     (working copy)
> > @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum
> >         return 3;
> > 
> >       case vec_construct:
> > -       elements = TYPE_VECTOR_SUBPARTS (vectype);
> > -       return elements / 2 + 1;
> > +       return TYPE_VECTOR_SUBPARTS (vectype) - 1;
> > 
> >       default:
> >         gcc_unreachable ();
> > Index: gcc/config/i386/i386.c
> > ===================================================================
> > --- gcc/config/i386/i386.c      (revision 237196)
> > +++ gcc/config/i386/i386.c      (working copy)
> > @@ -49503,8 +49520,6 @@ static int
> > ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> >                                  tree vectype, int)
> > {
> > -  unsigned elements;
> > -
> >   switch (type_of_cost)
> >     {
> >       case scalar_stmt:
> > @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve
> >         return ix86_cost->vec_stmt_cost;
> > 
> >       case vec_construct:
> > -       elements = TYPE_VECTOR_SUBPARTS (vectype);
> > -       return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
> > +       return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
> > 
> >       default:
> >         gcc_unreachable ();
> > 
> > 
> >> Thanks,
> >> Bill
> >> 
> >> 
> >> 2012-06-12  Bill Schmidt  <wschmidt@linux.ibm.com>
> >> 
> >>        * targhooks.c (default_builtin_vectorized_conversion): Handle
> >>        vec_construct, using vectype to base cost on subparts.
> >>        * target.h (enum vect_cost_for_stmt): Add vec_construct.
> >>        * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
> >>        instead of scalar_to-vec.
> >>        * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
> >>        vec_construct in same way as default for now.
> >>        * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> >>        * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
> >>        Handle vec_construct, including special case for 32-bit loads.
> >> 
> >> 
> >> Index: gcc/targhooks.c
> >> ===================================================================
> >> --- gcc/targhooks.c     (revision 188482)
> >> +++ gcc/targhooks.c     (working copy)
> >> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
> >> 
> >> int
> >> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> >> -                                    tree vectype ATTRIBUTE_UNUSED,
> >> +                                    tree vectype,
> >>                                     int misalign ATTRIBUTE_UNUSED)
> >> {
> >> +  unsigned elements;
> >> +
> >>   switch (type_of_cost)
> >>     {
> >>       case scalar_stmt:
> >> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
> >>       case cond_branch_taken:
> >>         return 3;
> >> 
> >> +      case vec_construct:
> >> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> >> +       gcc_assert (elements > 1);
> >> +       return elements / 2 + 1;
> >> +
> >>       default:
> >>         gcc_unreachable ();
> >>     }
> >> Index: gcc/target.h
> >> ===================================================================
> >> --- gcc/target.h        (revision 188482)
> >> +++ gcc/target.h        (working copy)
> >> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
> >>   cond_branch_not_taken,
> >>   cond_branch_taken,
> >>   vec_perm,
> >> -  vec_promote_demote
> >> +  vec_promote_demote,
> >> +  vec_construct
> >> };
> >> 
> >> /* The target structure.  This holds all the backend hooks.  */
> >> Index: gcc/tree-vect-stmts.c
> >> ===================================================================
> >> --- gcc/tree-vect-stmts.c       (revision 188482)
> >> +++ gcc/tree-vect-stmts.c       (working copy)
> >> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
> >>   /* The loads themselves.  */
> >>   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> >>     {
> >> -      /* N scalar loads plus gathering them into a vector.
> >> -         ???  scalar_to_vec isn't the cost for that.  */
> >> +      /* N scalar loads plus gathering them into a vector.  */
> >> +      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> >>       inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
> >> -                     * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
> >> -      inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
> >> +                     * TYPE_VECTOR_SUBPARTS (vectype));
> >> +      inside_cost += ncopies
> >> +       * targetm.vectorize.builtin_vectorization_cost (vec_construct,
> >> +                                                       vectype, 0);
> >>     }
> >>   else
> >>     vect_get_load_cost (first_dr, ncopies,
> >> Index: gcc/config/spu/spu.c
> >> ===================================================================
> >> --- gcc/config/spu/spu.c        (revision 188482)
> >> +++ gcc/config/spu/spu.c        (working copy)
> >> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
> >> /* Implement targetm.vectorize.builtin_vectorization_cost.  */
> >> static int
> >> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> >> -                                tree vectype ATTRIBUTE_UNUSED,
> >> +                                tree vectype,
> >>                                 int misalign ATTRIBUTE_UNUSED)
> >> {
> >> +  unsigned elements;
> >> +
> >>   switch (type_of_cost)
> >>     {
> >>       case scalar_stmt:
> >> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
> >>       case cond_branch_taken:
> >>         return 6;
> >> 
> >> +      case vec_construct:
> >> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> >> +       gcc_assert (elements > 1);
> >> +       return elements / 2 + 1;
> >> +
> >>       default:
> >>         gcc_unreachable ();
> >>     }
> >> Index: gcc/config/i386/i386.c
> >> ===================================================================
> >> --- gcc/config/i386/i386.c      (revision 188482)
> >> +++ gcc/config/i386/i386.c      (working copy)
> >> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
> >> /* Implement targetm.vectorize.builtin_vectorization_cost.  */
> >> static int
> >> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> >> -                                 tree vectype ATTRIBUTE_UNUSED,
> >> +                                 tree vectype,
> >>                                  int misalign ATTRIBUTE_UNUSED)
> >> {
> >> +  unsigned elements;
> >> +
> >>   switch (type_of_cost)
> >>     {
> >>       case scalar_stmt:
> >> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
> >>       case vec_promote_demote:
> >>         return ix86_cost->vec_stmt_cost;
> >> 
> >> +      case vec_construct:
> >> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> >> +       gcc_assert (elements > 1);
> >> +       return elements / 2 + 1;
> >> +
> >>       default:
> >>         gcc_unreachable ();
> >>     }
> >> Index: gcc/config/rs6000/rs6000.c
> >> ===================================================================
> >> --- gcc/config/rs6000/rs6000.c  (revision 188482)
> >> +++ gcc/config/rs6000/rs6000.c  (working copy)
> >> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
> >>                                    tree vectype, int misalign)
> >> {
> >>   unsigned elements;
> >> +  tree elem_type;
> >> 
> >>   switch (type_of_cost)
> >>     {
> >> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
> >> 
> >>         return 2;
> >> 
> >> +      case vec_construct:
> >> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
> >> +       elem_type = TREE_TYPE (vectype);
> >> +       gcc_assert (elements > 1);
> >> +       /* 32-bit vectors loaded into registers are stored as double
> >> +          precision, so we need n/2 converts in addition to the usual
> >> +          n/2 merges to construct a vector of short floats from them.  */
> >> +       if (SCALAR_FLOAT_TYPE_P (elem_type)
> >> +           && TYPE_PRECISION (elem_type) == 32)
> >> +         return elements + 1;
> >> +       else
> >> +         return elements / 2 + 1;
> >> +
> >>       default:
> >>         gcc_unreachable ();
> >>     }
> >> 
> >> 
> > 
> 
>
Bill Schmidt June 9, 2016, 3:27 p.m. UTC | #3
> On Jun 8, 2016, at 9:05 AM, Richard Biener <rguenther@suse.de> wrote:
> 
> On Wed, 8 Jun 2016, Bill Schmidt wrote:
> 
>> Hi Richard,
>> 
>>> On Jun 8, 2016, at 7:29 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>> 
>>> On Wed, Jun 13, 2012 at 4:18 AM, William J. Schmidt
>>> <wschmidt@linux.vnet.ibm.com> wrote:
>>>> This patch is a follow-up to the discussion generated by
>>>> http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00546.html.  I've added
>>>> vec_construct to the cost model for use in vect_model_load_cost, and
>>>> implemented a cost calculation that makes sense to me for PowerPC.  I'm
>>>> less certain about the default, i386, and spu implementations.  I took a
>>>> guess at i386 from the discussions we had, and used the same calculation
>>>> for the default and for spu.  I'm hoping you or others can fill in the
>>>> blanks if I guessed badly.
>>>> 
>>>> The i386 cost for vec_construct is different from all the others, which
>>>> are parameterized for each processor description.  This should probably
>>>> be parameterized in some way as well, but thought you'd know better than
>>>> I how that should be.  Perhaps instead of
>>>> 
>>>>       elements / 2 + 1
>>>> 
>>>> it should be
>>>> 
>>>>       (elements / 2) * X + Y
>>>> 
>>>> where X and Y are taken from the processor description, and represent
>>>> the cost of a merge and a permute, respectively.  Let me know what you
>>>> think.
>>> 
>>> Just trying to understand how you arrived at the above formulas in investigating
>>> strangely low cost for v16qi construction of 9.  If we pairwise reduce elements
>>> with a cost of 1 then we arrive at a cost of elements - 1, that's what you'd
>>> get with not accounting an initial move of element zero into a vector and then
>>> inserting each other element into that with elements - 1 inserts.
>> 
>> What I wrote there only makes partial sense for certain types on Power, so far as
>> I can tell, and even then it doesn’t generalize properly.  When the scalar registers
>> are contained in the vector registers (as happens for floating-point on Power), then
>> you can do some merges and other forms of permutes to combine them faster
>> than doing specific inserts.  But that isn’t a general solution even on Power; for the
>> integer modes we still do inserts.
> 
> You mean Power has instructions to combine more than two vector registers
> into one?  Otherwise you still need n / 2 plus n / 4 plus n / 8 ...
> "permutes" which boils down to n - 1.

Right, we do not.  This is what I meant about it not generalizing properly -- it was an
ill-thought-out, off-the-cuff remark, so far as I can tell.  I agree with the n - 1, and I need
to go in and make a similar change for Power.  There is a special case with 32-bit
floating-point that probably also needs adjustment.

Bill

> 
>> So what you have makes sense to me, and what’s currently in place for Power needs
>> work also, so far as I can tell.  I’ll take a note to revisit this.
> 
> Thanks.
> Richard.
> 
>> — Bill
>> 
>>> 
>>> This also matches up with code-generation on x86_64 for
>>> 
>>> vT foo (T a, T b, ...)
>>> {
>>> return (vT) {a, b, ... };
>>> }
>>> 
>>> for any vector / element type combination I tried.  Thus the patch below.
>>> 
>>> I'll bootstrap / test that on x86_64-linux and I'm leaving other
>>> targets to target
>>> maintainers.
>>> 
>>> Ok for the i386 parts?
>>> 
>>> Thanks,
>>> Richard.
>>> 
>>> 2016-06-08  Richard Biener  <rguenther@suse.de>
>>> 
>>>       * targhooks.c (default_builtin_vectorization_cost): Adjust
>>>       vec_construct cost.
>>>       * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>>> 
>>> Index: gcc/targhooks.c
>>> ===================================================================
>>> --- gcc/targhooks.c     (revision 237196)
>>> +++ gcc/targhooks.c     (working copy)
>>> @@ -589,8 +589,7 @@ default_builtin_vectorization_cost (enum
>>>        return 3;
>>> 
>>>      case vec_construct:
>>> -       elements = TYPE_VECTOR_SUBPARTS (vectype);
>>> -       return elements / 2 + 1;
>>> +       return TYPE_VECTOR_SUBPARTS (vectype) - 1;
>>> 
>>>      default:
>>>        gcc_unreachable ();
>>> Index: gcc/config/i386/i386.c
>>> ===================================================================
>>> --- gcc/config/i386/i386.c      (revision 237196)
>>> +++ gcc/config/i386/i386.c      (working copy)
>>> @@ -49503,8 +49520,6 @@ static int
>>> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>>>                                 tree vectype, int)
>>> {
>>> -  unsigned elements;
>>> -
>>>  switch (type_of_cost)
>>>    {
>>>      case scalar_stmt:
>>> @@ -49546,8 +49561,7 @@ ix86_builtin_vectorization_cost (enum ve
>>>        return ix86_cost->vec_stmt_cost;
>>> 
>>>      case vec_construct:
>>> -       elements = TYPE_VECTOR_SUBPARTS (vectype);
>>> -       return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
>>> +       return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
>>> 
>>>      default:
>>>        gcc_unreachable ();
>>> 
>>> 
>>>> Thanks,
>>>> Bill
>>>> 
>>>> 
>>>> 2012-06-12  Bill Schmidt  <wschmidt@linux.ibm.com>
>>>> 
>>>>       * targhooks.c (default_builtin_vectorized_conversion): Handle
>>>>       vec_construct, using vectype to base cost on subparts.
>>>>       * target.h (enum vect_cost_for_stmt): Add vec_construct.
>>>>       * tree-vect-stmts.c (vect_model_load_cost): Use vec_construct
>>>>       instead of scalar_to-vec.
>>>>       * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
>>>>       vec_construct in same way as default for now.
>>>>       * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>>>>       * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost):
>>>>       Handle vec_construct, including special case for 32-bit loads.
>>>> 
>>>> 
>>>> Index: gcc/targhooks.c
>>>> ===================================================================
>>>> --- gcc/targhooks.c     (revision 188482)
>>>> +++ gcc/targhooks.c     (working copy)
>>>> @@ -499,9 +499,11 @@ default_builtin_vectorized_conversion (unsigned in
>>>> 
>>>> int
>>>> default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>>>> -                                    tree vectype ATTRIBUTE_UNUSED,
>>>> +                                    tree vectype,
>>>>                                    int misalign ATTRIBUTE_UNUSED)
>>>> {
>>>> +  unsigned elements;
>>>> +
>>>>  switch (type_of_cost)
>>>>    {
>>>>      case scalar_stmt:
>>>> @@ -524,6 +526,11 @@ default_builtin_vectorization_cost (enum vect_cost
>>>>      case cond_branch_taken:
>>>>        return 3;
>>>> 
>>>> +      case vec_construct:
>>>> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
>>>> +       gcc_assert (elements > 1);
>>>> +       return elements / 2 + 1;
>>>> +
>>>>      default:
>>>>        gcc_unreachable ();
>>>>    }
>>>> Index: gcc/target.h
>>>> ===================================================================
>>>> --- gcc/target.h        (revision 188482)
>>>> +++ gcc/target.h        (working copy)
>>>> @@ -146,7 +146,8 @@ enum vect_cost_for_stmt
>>>>  cond_branch_not_taken,
>>>>  cond_branch_taken,
>>>>  vec_perm,
>>>> -  vec_promote_demote
>>>> +  vec_promote_demote,
>>>> +  vec_construct
>>>> };
>>>> 
>>>> /* The target structure.  This holds all the backend hooks.  */
>>>> Index: gcc/tree-vect-stmts.c
>>>> ===================================================================
>>>> --- gcc/tree-vect-stmts.c       (revision 188482)
>>>> +++ gcc/tree-vect-stmts.c       (working copy)
>>>> @@ -1031,11 +1031,13 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
>>>>  /* The loads themselves.  */
>>>>  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
>>>>    {
>>>> -      /* N scalar loads plus gathering them into a vector.
>>>> -         ???  scalar_to_vec isn't the cost for that.  */
>>>> +      /* N scalar loads plus gathering them into a vector.  */
>>>> +      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>>>>      inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
>>>> -                     * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
>>>> -      inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
>>>> +                     * TYPE_VECTOR_SUBPARTS (vectype));
>>>> +      inside_cost += ncopies
>>>> +       * targetm.vectorize.builtin_vectorization_cost (vec_construct,
>>>> +                                                       vectype, 0);
>>>>    }
>>>>  else
>>>>    vect_get_load_cost (first_dr, ncopies,
>>>> Index: gcc/config/spu/spu.c
>>>> ===================================================================
>>>> --- gcc/config/spu/spu.c        (revision 188482)
>>>> +++ gcc/config/spu/spu.c        (working copy)
>>>> @@ -6908,9 +6908,11 @@ spu_builtin_mask_for_load (void)
>>>> /* Implement targetm.vectorize.builtin_vectorization_cost.  */
>>>> static int
>>>> spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>>>> -                                tree vectype ATTRIBUTE_UNUSED,
>>>> +                                tree vectype,
>>>>                                int misalign ATTRIBUTE_UNUSED)
>>>> {
>>>> +  unsigned elements;
>>>> +
>>>>  switch (type_of_cost)
>>>>    {
>>>>      case scalar_stmt:
>>>> @@ -6937,6 +6939,11 @@ spu_builtin_vectorization_cost (enum vect_cost_for
>>>>      case cond_branch_taken:
>>>>        return 6;
>>>> 
>>>> +      case vec_construct:
>>>> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
>>>> +       gcc_assert (elements > 1);
>>>> +       return elements / 2 + 1;
>>>> +
>>>>      default:
>>>>        gcc_unreachable ();
>>>>    }
>>>> Index: gcc/config/i386/i386.c
>>>> ===================================================================
>>>> --- gcc/config/i386/i386.c      (revision 188482)
>>>> +++ gcc/config/i386/i386.c      (working copy)
>>>> @@ -36072,9 +36072,11 @@ static const struct attribute_spec ix86_attribute_
>>>> /* Implement targetm.vectorize.builtin_vectorization_cost.  */
>>>> static int
>>>> ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
>>>> -                                 tree vectype ATTRIBUTE_UNUSED,
>>>> +                                 tree vectype,
>>>>                                 int misalign ATTRIBUTE_UNUSED)
>>>> {
>>>> +  unsigned elements;
>>>> +
>>>>  switch (type_of_cost)
>>>>    {
>>>>      case scalar_stmt:
>>>> @@ -36115,6 +36117,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
>>>>      case vec_promote_demote:
>>>>        return ix86_cost->vec_stmt_cost;
>>>> 
>>>> +      case vec_construct:
>>>> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
>>>> +       gcc_assert (elements > 1);
>>>> +       return elements / 2 + 1;
>>>> +
>>>>      default:
>>>>        gcc_unreachable ();
>>>>    }
>>>> Index: gcc/config/rs6000/rs6000.c
>>>> ===================================================================
>>>> --- gcc/config/rs6000/rs6000.c  (revision 188482)
>>>> +++ gcc/config/rs6000/rs6000.c  (working copy)
>>>> @@ -3405,6 +3405,7 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>>>>                                   tree vectype, int misalign)
>>>> {
>>>>  unsigned elements;
>>>> +  tree elem_type;
>>>> 
>>>>  switch (type_of_cost)
>>>>    {
>>>> @@ -3504,6 +3505,19 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>>>> 
>>>>        return 2;
>>>> 
>>>> +      case vec_construct:
>>>> +       elements = TYPE_VECTOR_SUBPARTS (vectype);
>>>> +       elem_type = TREE_TYPE (vectype);
>>>> +       gcc_assert (elements > 1);
>>>> +       /* 32-bit vectors loaded into registers are stored as double
>>>> +          precision, so we need n/2 converts in addition to the usual
>>>> +          n/2 merges to construct a vector of short floats from them.  */
>>>> +       if (SCALAR_FLOAT_TYPE_P (elem_type)
>>>> +           && TYPE_PRECISION (elem_type) == 32)
>>>> +         return elements + 1;
>>>> +       else
>>>> +         return elements / 2 + 1;
>>>> +
>>>>      default:
>>>>        gcc_unreachable ();
>>>>    }
>>>> 
>>>> 
>>> 
>> 
>> 
> 
> -- 
> Richard Biener <rguenther@suse.de>
> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)

Patch
diff mbox

Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c     (revision 237196)
+++ gcc/targhooks.c     (working copy)
@@ -589,8 +589,7 @@  default_builtin_vectorization_cost (enum
         return 3;

       case vec_construct:
-       elements = TYPE_VECTOR_SUBPARTS (vectype);
-       return elements / 2 + 1;
+       return TYPE_VECTOR_SUBPARTS (vectype) - 1;

       default:
         gcc_unreachable ();
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c      (revision 237196)
+++ gcc/config/i386/i386.c      (working copy)
@@ -49503,8 +49520,6 @@  static int
 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                  tree vectype, int)
 {
-  unsigned elements;
-
   switch (type_of_cost)
     {
       case scalar_stmt:
@@ -49546,8 +49561,7 @@  ix86_builtin_vectorization_cost (enum ve
         return ix86_cost->vec_stmt_cost;

       case vec_construct:
-       elements = TYPE_VECTOR_SUBPARTS (vectype);
-       return ix86_cost->vec_stmt_cost * (elements / 2 + 1);
+       return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);

       default:
         gcc_unreachable ();