diff mbox

[gomp4,simd,RFC] Simple fix to override vectorization cost estimation.

Message ID CAGYS_TL77Dos_ZvCTz3nBmvQYqxrS2UerjsTcrbjuAvdChcJrA@mail.gmail.com
State New
Headers show

Commit Message

Sergey Ostanevich Nov. 19, 2013, 2:04 p.m. UTC
:) agree to you, but as soon as you're a user who tries to introduce
vector code and face a bug in cost model you'd like to have a
workaround until the bug will be fixed and compiler will come to you
with new OS distribution, don't you?

I propose the following, yet SLP have to use a NULL as a loop info
which looks somewhat hacky.

Sergos


        * common.opt: Added new option -fsimd-vect-cost-model
        * tree-vectorizer.h (unlimited_cost_model): Interface update
        to rely on particular loop info
        * tree-vect-data-refs.c (vect_peeling_hash_insert): Update to
        unlimited_cost_model call according to new interface
        (vect_peeling_hash_choose_best_peeling): Ditto
        (vect_enhance_data_refs_alignment): Ditto
        * tree-vect-slp.c: Ditto
        * tree-vect-loop.c (vect_estimate_min_profitable_iters): Ditto
        plus issue a warning in case cost model overrides users' directive



 /* Source location */

On Mon, Nov 18, 2013 at 7:13 PM, Richard Biener <rguenther@suse.de> wrote:
> On Mon, 18 Nov 2013, Sergey Ostanevich wrote:
>
>> I would agree that the example is just for the case cost model makes
>> correct estimation But how can we assure ourself that it won't have any
>> mistakes in the future?
>
> We call it bugs and not mistakes and we have bugzilla for it.
>
> Richard.
>
>> I believe it'll be Ok to introduce an extra flag as Jakub proposed for the
>> dedicated simd-forced vectorization to use unlimited cost model. This
>> can be default for -fopenmp or there should be a warning issued that
>> compiler overrides user's request of vectorization. In such a case user
>> can enforce vectorization (even with mentioned results :) with this
>> unlimited cost model for simd.
>>
>>
>>
>> On Fri, Nov 15, 2013 at 6:24 PM, Richard Biener <rguenther@suse.de> wrote:
>> > On Fri, 15 Nov 2013, Sergey Ostanevich wrote:
>> >
>> >> Richard,
>> >>
>> >> here's an example that causes trigger for the cost model.
>> >
>> > I hardly believe that (AVX2)
>> >
>> > .L9:
>> >         vmovups (%rsi), %xmm3
>> >         addl    $1, %r8d
>> >         addq    $256, %rsi
>> >         vinsertf128     $0x1, -240(%rsi), %ymm3, %ymm1
>> >         vmovups -224(%rsi), %xmm3
>> >         vinsertf128     $0x1, -208(%rsi), %ymm3, %ymm3
>> >         vshufps $136, %ymm3, %ymm1, %ymm3
>> >         vperm2f128      $3, %ymm3, %ymm3, %ymm2
>> >         vshufps $68, %ymm2, %ymm3, %ymm1
>> >         vshufps $238, %ymm2, %ymm3, %ymm2
>> >         vmovups -192(%rsi), %xmm3
>> >         vinsertf128     $1, %xmm2, %ymm1, %ymm2
>> >         vinsertf128     $0x1, -176(%rsi), %ymm3, %ymm1
>> >         vmovups -160(%rsi), %xmm3
>> >         vinsertf128     $0x1, -144(%rsi), %ymm3, %ymm3
>> >         vshufps $136, %ymm3, %ymm1, %ymm3
>> >         vperm2f128      $3, %ymm3, %ymm3, %ymm1
>> >         vshufps $68, %ymm1, %ymm3, %ymm4
>> >         vshufps $238, %ymm1, %ymm3, %ymm1
>> >         vmovups -128(%rsi), %xmm3
>> >         vinsertf128     $1, %xmm1, %ymm4, %ymm1
>> >         vshufps $136, %ymm1, %ymm2, %ymm1
>> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
>> >         vshufps $68, %ymm2, %ymm1, %ymm4
>> >         vshufps $238, %ymm2, %ymm1, %ymm2
>> >         vinsertf128     $0x1, -112(%rsi), %ymm3, %ymm1
>> >         vmovups -96(%rsi), %xmm3
>> >         vinsertf128     $1, %xmm2, %ymm4, %ymm4
>> >         vinsertf128     $0x1, -80(%rsi), %ymm3, %ymm3
>> >         vshufps $136, %ymm3, %ymm1, %ymm3
>> >         vperm2f128      $3, %ymm3, %ymm3, %ymm2
>> >         vshufps $68, %ymm2, %ymm3, %ymm1
>> >         vshufps $238, %ymm2, %ymm3, %ymm2
>> >         vmovups -64(%rsi), %xmm3
>> >         vinsertf128     $1, %xmm2, %ymm1, %ymm2
>> >         vinsertf128     $0x1, -48(%rsi), %ymm3, %ymm1
>> >         vmovups -32(%rsi), %xmm3
>> >         vinsertf128     $0x1, -16(%rsi), %ymm3, %ymm3
>> >         cmpl    %r8d, %edi
>> >         vshufps $136, %ymm3, %ymm1, %ymm3
>> >         vperm2f128      $3, %ymm3, %ymm3, %ymm1
>> >         vshufps $68, %ymm1, %ymm3, %ymm5
>> >         vshufps $238, %ymm1, %ymm3, %ymm1
>> >         vinsertf128     $1, %xmm1, %ymm5, %ymm1
>> >         vshufps $136, %ymm1, %ymm2, %ymm1
>> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
>> >         vshufps $68, %ymm2, %ymm1, %ymm3
>> >         vshufps $238, %ymm2, %ymm1, %ymm2
>> >         vinsertf128     $1, %xmm2, %ymm3, %ymm1
>> >         vshufps $136, %ymm1, %ymm4, %ymm1
>> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
>> >         vshufps $68, %ymm2, %ymm1, %ymm3
>> >         vshufps $238, %ymm2, %ymm1, %ymm2
>> >         vinsertf128     $1, %xmm2, %ymm3, %ymm2
>> >         vaddps  %ymm2, %ymm0, %ymm0
>> >         ja      .L9
>> >
>> > is more efficient than
>> >
>> > .L3:
>> >         vaddss  (%rcx,%rax), %xmm0, %xmm0
>> >         addq    $32, %rax
>> >         cmpq    %rdx, %rax
>> >         jne     .L3
>> >
>> > ;)
>> >
>> >> As soon as
>> >> elemental functions will appear and we update the vectorizer so it can accept
>> >> an elemental function inside the loop - we will have the same
>> >> situation as we have
>> >> it now: cost model will bail out with profitability estimation.
>> >
>> > Yes.
>> >
>> >> Still we have no chance to get info on how efficient the bar() function when it
>> >> is in vector form.
>> >
>> > Well I assume you mean that the speedup when vectorizing the elemental
>> > will offset whatever wreckage we cause with vectorizing the rest of the
>> > statements.  I'd say you can at least compare to unrolling by
>> > the vectorization factor, building the vector inputs to the elemental
>> > from scalars, distributing the vector result from the elemental to
>> > scalars.
>> >
>> >> I believe I should repeat: #pragma omp simd is intended for introduction of an
>> >> instruction-level parallel region on developer's request, hence should
>> >> be treated
>> >> in same manner as #pragma omp parallel. Vectorizer cost model is an obstacle
>> >> here, not a help.
>> >
>> > Surely not if there isn't an elemental call in it.  With it the
>> > cost model of course will have not enough information to decide.
>> >
>> > But still, what's the difference to the case where we cannot vectorize
>> > the function?  What happens if we cannot vectorize the elemental?
>> > Do we have to build scalar versions for all possible vector sizes?
>> >
>> > Richard.
>> >
>> >> Regards,
>> >> Sergos
>> >>
>> >>
>> >> On Fri, Nov 15, 2013 at 1:08 AM, Richard Biener <rguenther@suse.de> wrote:
>> >> > Sergey Ostanevich <sergos.gnu@gmail.com> wrote:
>> >> >>this is only for the whole file? I mean to have a particular loop
>> >> >>vectorized in a
>> >> >>file while all others - up to compiler's cost model. is there such a
>> >> >>machinery?
>> >> >
>> >> > No, there is not.
>> >> >
>> >> > Richard.
>> >> >
>> >> >>Sergos
>> >> >>
>> >> >>On Thu, Nov 14, 2013 at 12:39 PM, Richard Biener <rguenther@suse.de>
>> >> >>wrote:
>> >> >>> On Wed, 13 Nov 2013, Sergey Ostanevich wrote:
>> >> >>>
>> >> >>>> I will get some tests.
>> >> >>>> As for cost analysis - simply consider the pragma as a request to
>> >> >>>> vectorize. How can I - as a developer - enforce it beyond the
>> >> >>pragma?
>> >> >>>
>> >> >>> You can disable the cost model via -fvect-cost-model=unlimited
>> >> >>>
>> >> >>> Richard.
>> >> >>>
>> >> >>>> On Wed, Nov 13, 2013 at 12:55 PM, Richard Biener <rguenther@suse.de>
>> >> >>wrote:
>> >> >>>> > On Tue, 12 Nov 2013, Sergey Ostanevich wrote:
>> >> >>>> >
>> >> >>>> >> The reason patch was in its original state is because we want
>> >> >>>> >> to notify user that his assumption of profitability may be wrong.
>> >> >>>> >> This is not a part of any spec and as far as I know ICC does not
>> >> >>>> >> notify user about the case. Still it can be a good hint for those
>> >> >>>> >> users who tries to get as much as possible performance.
>> >> >>>> >>
>> >> >>>> >> Richard's comment on the vectorization problems is about the same
>> >> >>-
>> >> >>>> >> to inform user that his attempt to force vectorization is failed.
>> >> >>>> >>
>> >> >>>> >> As for profitable or not - sometimes I believe it's impossible to
>> >> >>be
>> >> >>>> >> precise. For OMP we have case of a vector version of a function
>> >> >>>> >> and we have no chance to figure out whether it is profitable to
>> >> >>use
>> >> >>>> >> it or to loose it. If we can't map the loop for any vector length
>> >> >>>> >> other than 1 - I believe in this case we have to bail out and
>> >> >>report.
>> >> >>>> >> Is it about 'never profitable'?
>> >> >>>> >
>> >> >>>> > For example.  I think we should report non-vectorized loops
>> >> >>>> > that are marked with force_vect anyway, with
>> >> >>-Wdisabled-optimization.
>> >> >>>> > Another case is that a loop may be profitable to vectorize if
>> >> >>>> > the ISA supports a gather instruction but otherwise not.  Or if
>> >> >>the
>> >> >>>> > ISA supports efficient vector construction from N not loop
>> >> >>>> > invariant scalars (for vectorization of strided loads).
>> >> >>>> >
>> >> >>>> > Simply disregarding all of the cost analysis sounds completely
>> >> >>>> > bogus to me.
>> >> >>>> >
>> >> >>>> > I'd simply go for the diagnostic for now, not changing anything
>> >> >>else.
>> >> >>>> > We want to have a good understanding about why the cost model is
>> >> >>>> > so bad that we have to force to ignore it for #pragma simd - thus
>> >> >>we
>> >> >>>> > want testcases.
>> >> >>>> >
>> >> >>>> > Richard.
>> >> >>>> >
>> >> >>>> >>
>> >> >>>> >> On Tue, Nov 12, 2013 at 6:35 PM, Richard Biener
>> >> >><rguenther@suse.de> wrote:
>> >> >>>> >> > On 11/12/13 3:16 PM, Jakub Jelinek wrote:
>> >> >>>> >> >> On Tue, Nov 12, 2013 at 05:46:14PM +0400, Sergey Ostanevich
>> >> >>wrote:
>> >> >>>> >> >>> ivdep just substitutes all cross-iteration data analysis,
>> >> >>>> >> >>> nothing related to cost model. ICC does not cancel its
>> >> >>>> >> >>> cost model in case of #pragma ivdep
>> >> >>>> >> >>>
>> >> >>>> >> >>> as for the safelen - OMP standart treats it as a limitation
>> >> >>>> >> >>> for the vector length. this means if no safelen is present
>> >> >>>> >> >>> an arbitrary vector length can be used.
>> >> >>>> >> >>
>> >> >>>> >> >> I was talking about GCC loop->safelen, which is INT_MAX for
>> >> >>#pragma omp simd
>> >> >>>> >> >> without safelen clause or #pragma simd without vectorlength
>> >> >>clause.
>> >> >>>> >> >>
>> >> >>>> >> >>> so I believe loop->force_vect is the only trigger to
>> >> >>disregard
>> >> >>>> >> >>> the cost model
>> >> >>>> >> >>
>> >> >>>> >> >> Anyway, in that case I think the originally posted patch is
>> >> >>wrong,
>> >> >>>> >> >> if we want to treat force_vect as disregard all the cost model
>> >> >>and
>> >> >>>> >> >> force vectorization (well, the name of the field already kind
>> >> >>of suggest
>> >> >>>> >> >> that), then IMHO we should treat it the same as
>> >> >>-fvect-cost-model=unlimited
>> >> >>>> >> >> for those loops.
>> >> >>>> >> >
>> >> >>>> >> > Err - the user may have a specific sub-architecture in mind
>> >> >>when using
>> >> >>>> >> > #pragma simd, if you say we should completely ignore the cost
>> >> >>model
>> >> >>>> >> > then should we also sorry () if we cannot vectorize the loop
>> >> >>(either
>> >> >>>> >> > because of GCC deficiencies or lack of sub-target support)?
>> >> >>>> >> >
>> >> >>>> >> > That said, at least in the cases that the cost model says the
>> >> >>loop
>> >> >>>> >> > is never profitable to vectorize we should follow its advice.
>> >> >>>> >> >
>> >> >>>> >> > Richard.
>> >> >>>> >> >
>> >> >>>> >> >> Thus (untested):
>> >> >>>> >> >>
>> >> >>>> >> >> 2013-11-12  Jakub Jelinek  <jakub@redhat.com>
>> >> >>>> >> >>
>> >> >>>> >> >>       * tree-vect-loop.c (vect_estimate_min_profitable_iters):
>> >> >>Use
>> >> >>>> >> >>       unlimited cost model also for force_vect loops.
>> >> >>>> >> >>
>> >> >>>> >> >> --- gcc/tree-vect-loop.c.jj   2013-11-12 12:09:40.000000000
>> >> >>+0100
>> >> >>>> >> >> +++ gcc/tree-vect-loop.c      2013-11-12 15:11:43.821404330
>> >> >>+0100
>> >> >>>> >> >> @@ -2702,7 +2702,7 @@ vect_estimate_min_profitable_iters (loop
>> >> >>>> >> >>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA
>> >> >>(loop_vinfo);
>> >> >>>> >> >>
>> >> >>>> >> >>    /* Cost model disabled.  */
>> >> >>>> >> >> -  if (unlimited_cost_model ())
>> >> >>>> >> >> +  if (unlimited_cost_model () || LOOP_VINFO_LOOP
>> >> >>(loop_vinfo)->force_vect)
>> >> >>>> >> >>      {
>> >> >>>> >> >>        dump_printf_loc (MSG_NOTE, vect_location, "cost model
>> >> >>disabled.\n");
>> >> >>>> >> >>        *ret_min_profitable_niters = 0;
>> >> >>>> >> >>
>> >> >>>> >> >>       Jakub
>> >> >>>> >> >>
>> >> >>>> >> >
>> >> >>>> >>
>> >> >>>> >>
>> >> >>>> >
>> >> >>>> > --
>> >> >>>> > Richard Biener <rguenther@suse.de>
>> >> >>>> > SUSE / SUSE Labs
>> >> >>>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> >> >>>> > GF: Jeff Hawn, Jennifer Guild, Felix Imend
>> >> >>>>
>> >> >>>>
>> >> >>>
>> >> >>> --
>> >> >>> Richard Biener <rguenther@suse.de>
>> >> >>> SUSE / SUSE Labs
>> >> >>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> >> >>> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
>> >> >
>> >> >
>> >>
>> >
>> > --
>> > Richard Biener <rguenther@suse.de>
>> > SUSE / SUSE Labs
>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
>>
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE / SUSE Labs
> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer

Comments

Richard Biener Nov. 19, 2013, 2:07 p.m. UTC | #1
On Tue, 19 Nov 2013, Sergey Ostanevich wrote:

> :) agree to you, but as soon as you're a user who tries to introduce
> vector code and face a bug in cost model you'd like to have a
> workaround until the bug will be fixed and compiler will come to you
> with new OS distribution, don't you?
> 
> I propose the following, yet SLP have to use a NULL as a loop info
> which looks somewhat hacky.

I think this is overengineering.  -fvect-cost-model will do as
workaround.  And -fsimd-vect-cost-model has what I consider
duplicate - "simd" and "vect".

Richard.

> Sergos
> 
> 
>         * common.opt: Added new option -fsimd-vect-cost-model
>         * tree-vectorizer.h (unlimited_cost_model): Interface update
>         to rely on particular loop info
>         * tree-vect-data-refs.c (vect_peeling_hash_insert): Update to
>         unlimited_cost_model call according to new interface
>         (vect_peeling_hash_choose_best_peeling): Ditto
>         (vect_enhance_data_refs_alignment): Ditto
>         * tree-vect-slp.c: Ditto
>         * tree-vect-loop.c (vect_estimate_min_profitable_iters): Ditto
>         plus issue a warning in case cost model overrides users' directive
> 
> 
> 
> diff --git a/gcc/common.opt b/gcc/common.opt
> index d5971df..87b3b37 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2296,6 +2296,10 @@ fvect-cost-model=
>  Common Joined RejectNegative Enum(vect_cost_model)
> Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT)
>  Specifies the cost model for vectorization
> 
> +fsimd-vect-cost-model=
> +Common Joined RejectNegative Enum(vect_cost_model)
> Var(flag_simd_vect_cost_model) Init(VECT_COST_MODEL_UNLIMITED)
> +Specifies the cost model for vectorization in loops marked with
> #pragma omp simd
> +
>  Enum
>  Name(vect_cost_model) Type(enum vect_cost_model) UnknownError(unknown
> vectorizer cost model %qs)
> 
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 83d1f45..e26f704 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -1090,7 +1090,8 @@ vect_peeling_hash_insert (loop_vec_info
> loop_vinfo, struct data_reference *dr,
>        *new_slot = slot;
>      }
> 
> -  if (!supportable_dr_alignment && unlimited_cost_model ())
> +  if (!supportable_dr_alignment
> +      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>      slot->count += VECT_MAX_COST;
>  }
> 
> @@ -1200,7 +1201,7 @@ vect_peeling_hash_choose_best_peeling
> (loop_vec_info loop_vinfo,
>     res.peel_info.dr = NULL;
>     res.body_cost_vec = stmt_vector_for_cost ();
> 
> -   if (!unlimited_cost_model ())
> +   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>       {
>         res.inside_cost = INT_MAX;
>         res.outside_cost = INT_MAX;
> @@ -1429,7 +1430,7 @@ vect_enhance_data_refs_alignment (loop_vec_info
> loop_vinfo)
>                   vectorization factor.
>                   We do this automtically for cost model, since we
> calculate cost
>                   for every peeling option.  */
> -              if (unlimited_cost_model ())
> +              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>                  possible_npeel_number = vf /nelements;
> 
>                /* Handle the aligned case. We may decide to align some other
> @@ -1437,7 +1438,7 @@ vect_enhance_data_refs_alignment (loop_vec_info
> loop_vinfo)
>                if (DR_MISALIGNMENT (dr) == 0)
>                  {
>                    npeel_tmp = 0;
> -                  if (unlimited_cost_model ())
> +                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>                      possible_npeel_number++;
>                  }
> 
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 86ebbd2..be66172 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -2696,7 +2696,7 @@ vect_estimate_min_profitable_iters
> (loop_vec_info loop_vinfo,
>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
> 
>    /* Cost model disabled.  */
> -  if (unlimited_cost_model ())
> +  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>      {
>        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
>        *ret_min_profitable_niters = 0;
> @@ -2929,6 +2929,11 @@ vect_estimate_min_profitable_iters
> (loop_vec_info loop_vinfo,
>    /* vector version will never be profitable.  */
>    else
>      {
> +      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
> +        {
> +          pedwarn (vect_location, 0, "Vectorization did not happen
> for the loop");
> +        }
> +
>        if (dump_enabled_p ())
>          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>   "cost model: the vector iteration cost = %d "
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 247bdfd..4b25964 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -2171,7 +2171,7 @@ vect_slp_analyze_bb_1 (basic_block bb)
>      }
> 
>    /* Cost model: check if the vectorization is worthwhile.  */
> -  if (!unlimited_cost_model ()
> +  if (!unlimited_cost_model (NULL)
>        && !vect_bb_vectorization_profitable_p (bb_vinfo))
>      {
>        if (dump_enabled_p ())
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index a6c5b59..2916906 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -919,9 +919,12 @@ known_alignment_for_access_p (struct
> data_reference *data_ref_info)
> 
>  /* Return true if the vect cost model is unlimited.  */
>  static inline bool
> -unlimited_cost_model ()
> +unlimited_cost_model (loop_p loop)
>  {
> -  return flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED;
> +  return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED
> +          || (loop != NULL
> +              && loop->force_vect
> +              && flag_simd_vect_cost_model == VECT_COST_MODEL_UNLIMITED));
>  }
> 
>  /* Source location */
> 
> On Mon, Nov 18, 2013 at 7:13 PM, Richard Biener <rguenther@suse.de> wrote:
> > On Mon, 18 Nov 2013, Sergey Ostanevich wrote:
> >
> >> I would agree that the example is just for the case cost model makes
> >> correct estimation But how can we assure ourself that it won't have any
> >> mistakes in the future?
> >
> > We call it bugs and not mistakes and we have bugzilla for it.
> >
> > Richard.
> >
> >> I believe it'll be Ok to introduce an extra flag as Jakub proposed for the
> >> dedicated simd-forced vectorization to use unlimited cost model. This
> >> can be default for -fopenmp or there should be a warning issued that
> >> compiler overrides user's request of vectorization. In such a case user
> >> can enforce vectorization (even with mentioned results :) with this
> >> unlimited cost model for simd.
> >>
> >>
> >>
> >> On Fri, Nov 15, 2013 at 6:24 PM, Richard Biener <rguenther@suse.de> wrote:
> >> > On Fri, 15 Nov 2013, Sergey Ostanevich wrote:
> >> >
> >> >> Richard,
> >> >>
> >> >> here's an example that causes trigger for the cost model.
> >> >
> >> > I hardly believe that (AVX2)
> >> >
> >> > .L9:
> >> >         vmovups (%rsi), %xmm3
> >> >         addl    $1, %r8d
> >> >         addq    $256, %rsi
> >> >         vinsertf128     $0x1, -240(%rsi), %ymm3, %ymm1
> >> >         vmovups -224(%rsi), %xmm3
> >> >         vinsertf128     $0x1, -208(%rsi), %ymm3, %ymm3
> >> >         vshufps $136, %ymm3, %ymm1, %ymm3
> >> >         vperm2f128      $3, %ymm3, %ymm3, %ymm2
> >> >         vshufps $68, %ymm2, %ymm3, %ymm1
> >> >         vshufps $238, %ymm2, %ymm3, %ymm2
> >> >         vmovups -192(%rsi), %xmm3
> >> >         vinsertf128     $1, %xmm2, %ymm1, %ymm2
> >> >         vinsertf128     $0x1, -176(%rsi), %ymm3, %ymm1
> >> >         vmovups -160(%rsi), %xmm3
> >> >         vinsertf128     $0x1, -144(%rsi), %ymm3, %ymm3
> >> >         vshufps $136, %ymm3, %ymm1, %ymm3
> >> >         vperm2f128      $3, %ymm3, %ymm3, %ymm1
> >> >         vshufps $68, %ymm1, %ymm3, %ymm4
> >> >         vshufps $238, %ymm1, %ymm3, %ymm1
> >> >         vmovups -128(%rsi), %xmm3
> >> >         vinsertf128     $1, %xmm1, %ymm4, %ymm1
> >> >         vshufps $136, %ymm1, %ymm2, %ymm1
> >> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
> >> >         vshufps $68, %ymm2, %ymm1, %ymm4
> >> >         vshufps $238, %ymm2, %ymm1, %ymm2
> >> >         vinsertf128     $0x1, -112(%rsi), %ymm3, %ymm1
> >> >         vmovups -96(%rsi), %xmm3
> >> >         vinsertf128     $1, %xmm2, %ymm4, %ymm4
> >> >         vinsertf128     $0x1, -80(%rsi), %ymm3, %ymm3
> >> >         vshufps $136, %ymm3, %ymm1, %ymm3
> >> >         vperm2f128      $3, %ymm3, %ymm3, %ymm2
> >> >         vshufps $68, %ymm2, %ymm3, %ymm1
> >> >         vshufps $238, %ymm2, %ymm3, %ymm2
> >> >         vmovups -64(%rsi), %xmm3
> >> >         vinsertf128     $1, %xmm2, %ymm1, %ymm2
> >> >         vinsertf128     $0x1, -48(%rsi), %ymm3, %ymm1
> >> >         vmovups -32(%rsi), %xmm3
> >> >         vinsertf128     $0x1, -16(%rsi), %ymm3, %ymm3
> >> >         cmpl    %r8d, %edi
> >> >         vshufps $136, %ymm3, %ymm1, %ymm3
> >> >         vperm2f128      $3, %ymm3, %ymm3, %ymm1
> >> >         vshufps $68, %ymm1, %ymm3, %ymm5
> >> >         vshufps $238, %ymm1, %ymm3, %ymm1
> >> >         vinsertf128     $1, %xmm1, %ymm5, %ymm1
> >> >         vshufps $136, %ymm1, %ymm2, %ymm1
> >> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
> >> >         vshufps $68, %ymm2, %ymm1, %ymm3
> >> >         vshufps $238, %ymm2, %ymm1, %ymm2
> >> >         vinsertf128     $1, %xmm2, %ymm3, %ymm1
> >> >         vshufps $136, %ymm1, %ymm4, %ymm1
> >> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
> >> >         vshufps $68, %ymm2, %ymm1, %ymm3
> >> >         vshufps $238, %ymm2, %ymm1, %ymm2
> >> >         vinsertf128     $1, %xmm2, %ymm3, %ymm2
> >> >         vaddps  %ymm2, %ymm0, %ymm0
> >> >         ja      .L9
> >> >
> >> > is more efficient than
> >> >
> >> > .L3:
> >> >         vaddss  (%rcx,%rax), %xmm0, %xmm0
> >> >         addq    $32, %rax
> >> >         cmpq    %rdx, %rax
> >> >         jne     .L3
> >> >
> >> > ;)
> >> >
> >> >> As soon as
> >> >> elemental functions will appear and we update the vectorizer so it can accept
> >> >> an elemental function inside the loop - we will have the same
> >> >> situation as we have
> >> >> it now: cost model will bail out with profitability estimation.
> >> >
> >> > Yes.
> >> >
> >> >> Still we have no chance to get info on how efficient the bar() function when it
> >> >> is in vector form.
> >> >
> >> > Well I assume you mean that the speedup when vectorizing the elemental
> >> > will offset whatever wreckage we cause with vectorizing the rest of the
> >> > statements.  I'd say you can at least compare to unrolling by
> >> > the vectorization factor, building the vector inputs to the elemental
> >> > from scalars, distributing the vector result from the elemental to
> >> > scalars.
> >> >
> >> >> I believe I should repeat: #pragma omp simd is intended for introduction of an
> >> >> instruction-level parallel region on developer's request, hence should
> >> >> be treated
> >> >> in same manner as #pragma omp parallel. Vectorizer cost model is an obstacle
> >> >> here, not a help.
> >> >
> >> > Surely not if there isn't an elemental call in it.  With it the
> >> > cost model of course will have not enough information to decide.
> >> >
> >> > But still, what's the difference to the case where we cannot vectorize
> >> > the function?  What happens if we cannot vectorize the elemental?
> >> > Do we have to build scalar versions for all possible vector sizes?
> >> >
> >> > Richard.
> >> >
> >> >> Regards,
> >> >> Sergos
> >> >>
> >> >>
> >> >> On Fri, Nov 15, 2013 at 1:08 AM, Richard Biener <rguenther@suse.de> wrote:
> >> >> > Sergey Ostanevich <sergos.gnu@gmail.com> wrote:
> >> >> >>this is only for the whole file? I mean to have a particular loop
> >> >> >>vectorized in a
> >> >> >>file while all others - up to compiler's cost model. is there such a
> >> >> >>machinery?
> >> >> >
> >> >> > No, there is not.
> >> >> >
> >> >> > Richard.
> >> >> >
> >> >> >>Sergos
> >> >> >>
> >> >> >>On Thu, Nov 14, 2013 at 12:39 PM, Richard Biener <rguenther@suse.de>
> >> >> >>wrote:
> >> >> >>> On Wed, 13 Nov 2013, Sergey Ostanevich wrote:
> >> >> >>>
> >> >> >>>> I will get some tests.
> >> >> >>>> As for cost analysis - simply consider the pragma as a request to
> >> >> >>>> vectorize. How can I - as a developer - enforce it beyond the
> >> >> >>pragma?
> >> >> >>>
> >> >> >>> You can disable the cost model via -fvect-cost-model=unlimited
> >> >> >>>
> >> >> >>> Richard.
> >> >> >>>
> >> >> >>>> On Wed, Nov 13, 2013 at 12:55 PM, Richard Biener <rguenther@suse.de>
> >> >> >>wrote:
> >> >> >>>> > On Tue, 12 Nov 2013, Sergey Ostanevich wrote:
> >> >> >>>> >
> >> >> >>>> >> The reason patch was in its original state is because we want
> >> >> >>>> >> to notify user that his assumption of profitability may be wrong.
> >> >> >>>> >> This is not a part of any spec and as far as I know ICC does not
> >> >> >>>> >> notify user about the case. Still it can be a good hint for those
> >> >> >>>> >> users who tries to get as much as possible performance.
> >> >> >>>> >>
> >> >> >>>> >> Richard's comment on the vectorization problems is about the same
> >> >> >>-
> >> >> >>>> >> to inform user that his attempt to force vectorization is failed.
> >> >> >>>> >>
> >> >> >>>> >> As for profitable or not - sometimes I believe it's impossible to
> >> >> >>be
> >> >> >>>> >> precise. For OMP we have case of a vector version of a function
> >> >> >>>> >> and we have no chance to figure out whether it is profitable to
> >> >> >>use
> >> >> >>>> >> it or to loose it. If we can't map the loop for any vector length
> >> >> >>>> >> other than 1 - I believe in this case we have to bail out and
> >> >> >>report.
> >> >> >>>> >> Is it about 'never profitable'?
> >> >> >>>> >
> >> >> >>>> > For example.  I think we should report non-vectorized loops
> >> >> >>>> > that are marked with force_vect anyway, with
> >> >> >>-Wdisabled-optimization.
> >> >> >>>> > Another case is that a loop may be profitable to vectorize if
> >> >> >>>> > the ISA supports a gather instruction but otherwise not.  Or if
> >> >> >>the
> >> >> >>>> > ISA supports efficient vector construction from N not loop
> >> >> >>>> > invariant scalars (for vectorization of strided loads).
> >> >> >>>> >
> >> >> >>>> > Simply disregarding all of the cost analysis sounds completely
> >> >> >>>> > bogus to me.
> >> >> >>>> >
> >> >> >>>> > I'd simply go for the diagnostic for now, not changing anything
> >> >> >>else.
> >> >> >>>> > We want to have a good understanding about why the cost model is
> >> >> >>>> > so bad that we have to force to ignore it for #pragma simd - thus
> >> >> >>we
> >> >> >>>> > want testcases.
> >> >> >>>> >
> >> >> >>>> > Richard.
> >> >> >>>> >
> >> >> >>>> >>
> >> >> >>>> >> On Tue, Nov 12, 2013 at 6:35 PM, Richard Biener
> >> >> >><rguenther@suse.de> wrote:
> >> >> >>>> >> > On 11/12/13 3:16 PM, Jakub Jelinek wrote:
> >> >> >>>> >> >> On Tue, Nov 12, 2013 at 05:46:14PM +0400, Sergey Ostanevich
> >> >> >>wrote:
> >> >> >>>> >> >>> ivdep just substitutes all cross-iteration data analysis,
> >> >> >>>> >> >>> nothing related to cost model. ICC does not cancel its
> >> >> >>>> >> >>> cost model in case of #pragma ivdep
> >> >> >>>> >> >>>
> >> >> >>>> >> >>> as for the safelen - OMP standart treats it as a limitation
> >> >> >>>> >> >>> for the vector length. this means if no safelen is present
> >> >> >>>> >> >>> an arbitrary vector length can be used.
> >> >> >>>> >> >>
> >> >> >>>> >> >> I was talking about GCC loop->safelen, which is INT_MAX for
> >> >> >>#pragma omp simd
> >> >> >>>> >> >> without safelen clause or #pragma simd without vectorlength
> >> >> >>clause.
> >> >> >>>> >> >>
> >> >> >>>> >> >>> so I believe loop->force_vect is the only trigger to
> >> >> >>disregard
> >> >> >>>> >> >>> the cost model
> >> >> >>>> >> >>
> >> >> >>>> >> >> Anyway, in that case I think the originally posted patch is
> >> >> >>wrong,
> >> >> >>>> >> >> if we want to treat force_vect as disregard all the cost model
> >> >> >>and
> >> >> >>>> >> >> force vectorization (well, the name of the field already kind
> >> >> >>of suggest
> >> >> >>>> >> >> that), then IMHO we should treat it the same as
> >> >> >>-fvect-cost-model=unlimited
> >> >> >>>> >> >> for those loops.
> >> >> >>>> >> >
> >> >> >>>> >> > Err - the user may have a specific sub-architecture in mind
> >> >> >>when using
> >> >> >>>> >> > #pragma simd, if you say we should completely ignore the cost
> >> >> >>model
> >> >> >>>> >> > then should we also sorry () if we cannot vectorize the loop
> >> >> >>(either
> >> >> >>>> >> > because of GCC deficiencies or lack of sub-target support)?
> >> >> >>>> >> >
> >> >> >>>> >> > That said, at least in the cases that the cost model says the
> >> >> >>loop
> >> >> >>>> >> > is never profitable to vectorize we should follow its advice.
> >> >> >>>> >> >
> >> >> >>>> >> > Richard.
> >> >> >>>> >> >
> >> >> >>>> >> >> Thus (untested):
> >> >> >>>> >> >>
> >> >> >>>> >> >> 2013-11-12  Jakub Jelinek  <jakub@redhat.com>
> >> >> >>>> >> >>
> >> >> >>>> >> >>       * tree-vect-loop.c (vect_estimate_min_profitable_iters):
> >> >> >>Use
> >> >> >>>> >> >>       unlimited cost model also for force_vect loops.
> >> >> >>>> >> >>
> >> >> >>>> >> >> --- gcc/tree-vect-loop.c.jj   2013-11-12 12:09:40.000000000
> >> >> >>+0100
> >> >> >>>> >> >> +++ gcc/tree-vect-loop.c      2013-11-12 15:11:43.821404330
> >> >> >>+0100
> >> >> >>>> >> >> @@ -2702,7 +2702,7 @@ vect_estimate_min_profitable_iters (loop
> >> >> >>>> >> >>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA
> >> >> >>(loop_vinfo);
> >> >> >>>> >> >>
> >> >> >>>> >> >>    /* Cost model disabled.  */
> >> >> >>>> >> >> -  if (unlimited_cost_model ())
> >> >> >>>> >> >> +  if (unlimited_cost_model () || LOOP_VINFO_LOOP
> >> >> >>(loop_vinfo)->force_vect)
> >> >> >>>> >> >>      {
> >> >> >>>> >> >>        dump_printf_loc (MSG_NOTE, vect_location, "cost model
> >> >> >>disabled.\n");
> >> >> >>>> >> >>        *ret_min_profitable_niters = 0;
> >> >> >>>> >> >>
> >> >> >>>> >> >>       Jakub
> >> >> >>>> >> >>
> >> >> >>>> >> >
> >> >> >>>> >>
> >> >> >>>> >>
> >> >> >>>> >
> >> >> >>>> > --
> >> >> >>>> > Richard Biener <rguenther@suse.de>
> >> >> >>>> > SUSE / SUSE Labs
> >> >> >>>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> >> >> >>>> > GF: Jeff Hawn, Jennifer Guild, Felix Imend
> >> >> >>>>
> >> >> >>>>
> >> >> >>>
> >> >> >>> --
> >> >> >>> Richard Biener <rguenther@suse.de>
> >> >> >>> SUSE / SUSE Labs
> >> >> >>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> >> >> >>> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
> >> >> >
> >> >> >
> >> >>
> >> >
> >> > --
> >> > Richard Biener <rguenther@suse.de>
> >> > SUSE / SUSE Labs
> >> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> >> > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
> >>
> >>
> >
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE / SUSE Labs
> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
> 
>
Jakub Jelinek Nov. 19, 2013, 2:11 p.m. UTC | #2
On Tue, Nov 19, 2013 at 03:07:52PM +0100, Richard Biener wrote:
> On Tue, 19 Nov 2013, Sergey Ostanevich wrote:
> 
> > :) agree to you, but as soon as you're a user who tries to introduce
> > vector code and face a bug in cost model you'd like to have a
> > workaround until the bug will be fixed and compiler will come to you
> > with new OS distribution, don't you?
> > 
> > I propose the following, yet SLP have to use a NULL as a loop info
> > which looks somewhat hacky.
> 
> I think this is overengineering.  -fvect-cost-model will do as
> workaround.  And -fsimd-vect-cost-model has what I consider
> duplicate - "simd" and "vect".

I think it is a good idea, though I agree about s/simd-vect/simd/ and
I'd use VECT_COST_MODEL_DEFAULT as the default, which would mean
just use -fvect-cost-model.

> > @@ -2929,6 +2929,11 @@ vect_estimate_min_profitable_iters
> > (loop_vec_info loop_vinfo,
> >    /* vector version will never be profitable.  */
> >    else
> >      {
> > +      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
> > +        {
> > +          pedwarn (vect_location, 0, "Vectorization did not happen
> > for the loop");
> > +        }

pedwarn isn't really desirable for this, you want just warning,
but some warning you can actually also turn off.
-Wopenmp-simd (and we'd use it also when we ignore #pragma omp declare simd
because it wasn't useful/desirable).

	Jakub
Sergey Ostanevich Nov. 19, 2013, 2:30 p.m. UTC | #3
On Tue, Nov 19, 2013 at 6:07 PM, Richard Biener <rguenther@suse.de> wrote:
> On Tue, 19 Nov 2013, Sergey Ostanevich wrote:
>
>> :) agree to you, but as soon as you're a user who tries to introduce
>> vector code and face a bug in cost model you'd like to have a
>> workaround until the bug will be fixed and compiler will come to you
>> with new OS distribution, don't you?
>>
>> I propose the following, yet SLP have to use a NULL as a loop info
>> which looks somewhat hacky.
>
> I think this is overengineering.  -fvect-cost-model will do as
> workaround.  And -fsimd-vect-cost-model has what I consider
> duplicate - "simd" and "vect".

I just wanted to separate the autovectorized loops from ones user
wants to vectorize. The -fvect-cost-model will force all at once.
That's the reason to introcude the simd-vect, since pragma name
is simd.

>
> Richard.
>
>> Sergos
>>
>>
>>         * common.opt: Added new option -fsimd-vect-cost-model
>>         * tree-vectorizer.h (unlimited_cost_model): Interface update
>>         to rely on particular loop info
>>         * tree-vect-data-refs.c (vect_peeling_hash_insert): Update to
>>         unlimited_cost_model call according to new interface
>>         (vect_peeling_hash_choose_best_peeling): Ditto
>>         (vect_enhance_data_refs_alignment): Ditto
>>         * tree-vect-slp.c: Ditto
>>         * tree-vect-loop.c (vect_estimate_min_profitable_iters): Ditto
>>         plus issue a warning in case cost model overrides users' directive
>>
>>
>>
>> diff --git a/gcc/common.opt b/gcc/common.opt
>> index d5971df..87b3b37 100644
>> --- a/gcc/common.opt
>> +++ b/gcc/common.opt
>> @@ -2296,6 +2296,10 @@ fvect-cost-model=
>>  Common Joined RejectNegative Enum(vect_cost_model)
>> Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT)
>>  Specifies the cost model for vectorization
>>
>> +fsimd-vect-cost-model=
>> +Common Joined RejectNegative Enum(vect_cost_model)
>> Var(flag_simd_vect_cost_model) Init(VECT_COST_MODEL_UNLIMITED)
>> +Specifies the cost model for vectorization in loops marked with
>> #pragma omp simd
>> +
>>  Enum
>>  Name(vect_cost_model) Type(enum vect_cost_model) UnknownError(unknown
>> vectorizer cost model %qs)
>>
>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>> index 83d1f45..e26f704 100644
>> --- a/gcc/tree-vect-data-refs.c
>> +++ b/gcc/tree-vect-data-refs.c
>> @@ -1090,7 +1090,8 @@ vect_peeling_hash_insert (loop_vec_info
>> loop_vinfo, struct data_reference *dr,
>>        *new_slot = slot;
>>      }
>>
>> -  if (!supportable_dr_alignment && unlimited_cost_model ())
>> +  if (!supportable_dr_alignment
>> +      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>      slot->count += VECT_MAX_COST;
>>  }
>>
>> @@ -1200,7 +1201,7 @@ vect_peeling_hash_choose_best_peeling
>> (loop_vec_info loop_vinfo,
>>     res.peel_info.dr = NULL;
>>     res.body_cost_vec = stmt_vector_for_cost ();
>>
>> -   if (!unlimited_cost_model ())
>> +   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>       {
>>         res.inside_cost = INT_MAX;
>>         res.outside_cost = INT_MAX;
>> @@ -1429,7 +1430,7 @@ vect_enhance_data_refs_alignment (loop_vec_info
>> loop_vinfo)
>>                   vectorization factor.
>>                   We do this automtically for cost model, since we
>> calculate cost
>>                   for every peeling option.  */
>> -              if (unlimited_cost_model ())
>> +              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>                  possible_npeel_number = vf /nelements;
>>
>>                /* Handle the aligned case. We may decide to align some other
>> @@ -1437,7 +1438,7 @@ vect_enhance_data_refs_alignment (loop_vec_info
>> loop_vinfo)
>>                if (DR_MISALIGNMENT (dr) == 0)
>>                  {
>>                    npeel_tmp = 0;
>> -                  if (unlimited_cost_model ())
>> +                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>                      possible_npeel_number++;
>>                  }
>>
>> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> index 86ebbd2..be66172 100644
>> --- a/gcc/tree-vect-loop.c
>> +++ b/gcc/tree-vect-loop.c
>> @@ -2696,7 +2696,7 @@ vect_estimate_min_profitable_iters
>> (loop_vec_info loop_vinfo,
>>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>>
>>    /* Cost model disabled.  */
>> -  if (unlimited_cost_model ())
>> +  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>      {
>>        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
>>        *ret_min_profitable_niters = 0;
>> @@ -2929,6 +2929,11 @@ vect_estimate_min_profitable_iters
>> (loop_vec_info loop_vinfo,
>>    /* vector version will never be profitable.  */
>>    else
>>      {
>> +      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
>> +        {
>> +          pedwarn (vect_location, 0, "Vectorization did not happen
>> for the loop");
>> +        }
>> +
>>        if (dump_enabled_p ())
>>          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>   "cost model: the vector iteration cost = %d "
>> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
>> index 247bdfd..4b25964 100644
>> --- a/gcc/tree-vect-slp.c
>> +++ b/gcc/tree-vect-slp.c
>> @@ -2171,7 +2171,7 @@ vect_slp_analyze_bb_1 (basic_block bb)
>>      }
>>
>>    /* Cost model: check if the vectorization is worthwhile.  */
>> -  if (!unlimited_cost_model ()
>> +  if (!unlimited_cost_model (NULL)
>>        && !vect_bb_vectorization_profitable_p (bb_vinfo))
>>      {
>>        if (dump_enabled_p ())
>> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
>> index a6c5b59..2916906 100644
>> --- a/gcc/tree-vectorizer.h
>> +++ b/gcc/tree-vectorizer.h
>> @@ -919,9 +919,12 @@ known_alignment_for_access_p (struct
>> data_reference *data_ref_info)
>>
>>  /* Return true if the vect cost model is unlimited.  */
>>  static inline bool
>> -unlimited_cost_model ()
>> +unlimited_cost_model (loop_p loop)
>>  {
>> -  return flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED;
>> +  return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED
>> +          || (loop != NULL
>> +              && loop->force_vect
>> +              && flag_simd_vect_cost_model == VECT_COST_MODEL_UNLIMITED));
>>  }
>>
>>  /* Source location */
>>
>> On Mon, Nov 18, 2013 at 7:13 PM, Richard Biener <rguenther@suse.de> wrote:
>> > On Mon, 18 Nov 2013, Sergey Ostanevich wrote:
>> >
>> >> I would agree that the example is just for the case cost model makes
>> >> correct estimation But how can we assure ourself that it won't have any
>> >> mistakes in the future?
>> >
>> > We call it bugs and not mistakes and we have bugzilla for it.
>> >
>> > Richard.
>> >
>> >> I believe it'll be Ok to introduce an extra flag as Jakub proposed for the
>> >> dedicated simd-forced vectorization to use unlimited cost model. This
>> >> can be default for -fopenmp or there should be a warning issued that
>> >> compiler overrides user's request of vectorization. In such a case user
>> >> can enforce vectorization (even with mentioned results :) with this
>> >> unlimited cost model for simd.
>> >>
>> >>
>> >>
>> >> On Fri, Nov 15, 2013 at 6:24 PM, Richard Biener <rguenther@suse.de> wrote:
>> >> > On Fri, 15 Nov 2013, Sergey Ostanevich wrote:
>> >> >
>> >> >> Richard,
>> >> >>
>> >> >> here's an example that causes trigger for the cost model.
>> >> >
>> >> > I hardly believe that (AVX2)
>> >> >
>> >> > .L9:
>> >> >         vmovups (%rsi), %xmm3
>> >> >         addl    $1, %r8d
>> >> >         addq    $256, %rsi
>> >> >         vinsertf128     $0x1, -240(%rsi), %ymm3, %ymm1
>> >> >         vmovups -224(%rsi), %xmm3
>> >> >         vinsertf128     $0x1, -208(%rsi), %ymm3, %ymm3
>> >> >         vshufps $136, %ymm3, %ymm1, %ymm3
>> >> >         vperm2f128      $3, %ymm3, %ymm3, %ymm2
>> >> >         vshufps $68, %ymm2, %ymm3, %ymm1
>> >> >         vshufps $238, %ymm2, %ymm3, %ymm2
>> >> >         vmovups -192(%rsi), %xmm3
>> >> >         vinsertf128     $1, %xmm2, %ymm1, %ymm2
>> >> >         vinsertf128     $0x1, -176(%rsi), %ymm3, %ymm1
>> >> >         vmovups -160(%rsi), %xmm3
>> >> >         vinsertf128     $0x1, -144(%rsi), %ymm3, %ymm3
>> >> >         vshufps $136, %ymm3, %ymm1, %ymm3
>> >> >         vperm2f128      $3, %ymm3, %ymm3, %ymm1
>> >> >         vshufps $68, %ymm1, %ymm3, %ymm4
>> >> >         vshufps $238, %ymm1, %ymm3, %ymm1
>> >> >         vmovups -128(%rsi), %xmm3
>> >> >         vinsertf128     $1, %xmm1, %ymm4, %ymm1
>> >> >         vshufps $136, %ymm1, %ymm2, %ymm1
>> >> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
>> >> >         vshufps $68, %ymm2, %ymm1, %ymm4
>> >> >         vshufps $238, %ymm2, %ymm1, %ymm2
>> >> >         vinsertf128     $0x1, -112(%rsi), %ymm3, %ymm1
>> >> >         vmovups -96(%rsi), %xmm3
>> >> >         vinsertf128     $1, %xmm2, %ymm4, %ymm4
>> >> >         vinsertf128     $0x1, -80(%rsi), %ymm3, %ymm3
>> >> >         vshufps $136, %ymm3, %ymm1, %ymm3
>> >> >         vperm2f128      $3, %ymm3, %ymm3, %ymm2
>> >> >         vshufps $68, %ymm2, %ymm3, %ymm1
>> >> >         vshufps $238, %ymm2, %ymm3, %ymm2
>> >> >         vmovups -64(%rsi), %xmm3
>> >> >         vinsertf128     $1, %xmm2, %ymm1, %ymm2
>> >> >         vinsertf128     $0x1, -48(%rsi), %ymm3, %ymm1
>> >> >         vmovups -32(%rsi), %xmm3
>> >> >         vinsertf128     $0x1, -16(%rsi), %ymm3, %ymm3
>> >> >         cmpl    %r8d, %edi
>> >> >         vshufps $136, %ymm3, %ymm1, %ymm3
>> >> >         vperm2f128      $3, %ymm3, %ymm3, %ymm1
>> >> >         vshufps $68, %ymm1, %ymm3, %ymm5
>> >> >         vshufps $238, %ymm1, %ymm3, %ymm1
>> >> >         vinsertf128     $1, %xmm1, %ymm5, %ymm1
>> >> >         vshufps $136, %ymm1, %ymm2, %ymm1
>> >> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
>> >> >         vshufps $68, %ymm2, %ymm1, %ymm3
>> >> >         vshufps $238, %ymm2, %ymm1, %ymm2
>> >> >         vinsertf128     $1, %xmm2, %ymm3, %ymm1
>> >> >         vshufps $136, %ymm1, %ymm4, %ymm1
>> >> >         vperm2f128      $3, %ymm1, %ymm1, %ymm2
>> >> >         vshufps $68, %ymm2, %ymm1, %ymm3
>> >> >         vshufps $238, %ymm2, %ymm1, %ymm2
>> >> >         vinsertf128     $1, %xmm2, %ymm3, %ymm2
>> >> >         vaddps  %ymm2, %ymm0, %ymm0
>> >> >         ja      .L9
>> >> >
>> >> > is more efficient than
>> >> >
>> >> > .L3:
>> >> >         vaddss  (%rcx,%rax), %xmm0, %xmm0
>> >> >         addq    $32, %rax
>> >> >         cmpq    %rdx, %rax
>> >> >         jne     .L3
>> >> >
>> >> > ;)
>> >> >
>> >> >> As soon as
>> >> >> elemental functions will appear and we update the vectorizer so it can accept
>> >> >> an elemental function inside the loop - we will have the same
>> >> >> situation as we have
>> >> >> it now: cost model will bail out with profitability estimation.
>> >> >
>> >> > Yes.
>> >> >
>> >> >> Still we have no chance to get info on how efficient the bar() function when it
>> >> >> is in vector form.
>> >> >
>> >> > Well I assume you mean that the speedup when vectorizing the elemental
>> >> > will offset whatever wreckage we cause with vectorizing the rest of the
>> >> > statements.  I'd say you can at least compare to unrolling by
>> >> > the vectorization factor, building the vector inputs to the elemental
>> >> > from scalars, distributing the vector result from the elemental to
>> >> > scalars.
>> >> >
>> >> >> I believe I should repeat: #pragma omp simd is intended for introduction of an
>> >> >> instruction-level parallel region on developer's request, hence should
>> >> >> be treated
>> >> >> in same manner as #pragma omp parallel. Vectorizer cost model is an obstacle
>> >> >> here, not a help.
>> >> >
>> >> > Surely not if there isn't an elemental call in it.  With it the
>> >> > cost model of course will have not enough information to decide.
>> >> >
>> >> > But still, what's the difference to the case where we cannot vectorize
>> >> > the function?  What happens if we cannot vectorize the elemental?
>> >> > Do we have to build scalar versions for all possible vector sizes?
>> >> >
>> >> > Richard.
>> >> >
>> >> >> Regards,
>> >> >> Sergos
>> >> >>
>> >> >>
>> >> >> On Fri, Nov 15, 2013 at 1:08 AM, Richard Biener <rguenther@suse.de> wrote:
>> >> >> > Sergey Ostanevich <sergos.gnu@gmail.com> wrote:
>> >> >> >>this is only for the whole file? I mean to have a particular loop
>> >> >> >>vectorized in a
>> >> >> >>file while all others - up to compiler's cost model. is there such a
>> >> >> >>machinery?
>> >> >> >
>> >> >> > No, there is not.
>> >> >> >
>> >> >> > Richard.
>> >> >> >
>> >> >> >>Sergos
>> >> >> >>
>> >> >> >>On Thu, Nov 14, 2013 at 12:39 PM, Richard Biener <rguenther@suse.de>
>> >> >> >>wrote:
>> >> >> >>> On Wed, 13 Nov 2013, Sergey Ostanevich wrote:
>> >> >> >>>
>> >> >> >>>> I will get some tests.
>> >> >> >>>> As for cost analysis - simply consider the pragma as a request to
>> >> >> >>>> vectorize. How can I - as a developer - enforce it beyond the
>> >> >> >>pragma?
>> >> >> >>>
>> >> >> >>> You can disable the cost model via -fvect-cost-model=unlimited
>> >> >> >>>
>> >> >> >>> Richard.
>> >> >> >>>
>> >> >> >>>> On Wed, Nov 13, 2013 at 12:55 PM, Richard Biener <rguenther@suse.de>
>> >> >> >>wrote:
>> >> >> >>>> > On Tue, 12 Nov 2013, Sergey Ostanevich wrote:
>> >> >> >>>> >
>> >> >> >>>> >> The reason patch was in its original state is because we want
>> >> >> >>>> >> to notify user that his assumption of profitability may be wrong.
>> >> >> >>>> >> This is not a part of any spec and as far as I know ICC does not
>> >> >> >>>> >> notify user about the case. Still it can be a good hint for those
>> >> >> >>>> >> users who tries to get as much as possible performance.
>> >> >> >>>> >>
>> >> >> >>>> >> Richard's comment on the vectorization problems is about the same
>> >> >> >>-
>> >> >> >>>> >> to inform user that his attempt to force vectorization is failed.
>> >> >> >>>> >>
>> >> >> >>>> >> As for profitable or not - sometimes I believe it's impossible to
>> >> >> >>be
>> >> >> >>>> >> precise. For OMP we have case of a vector version of a function
>> >> >> >>>> >> and we have no chance to figure out whether it is profitable to
>> >> >> >>use
>> >> >> >>>> >> it or to loose it. If we can't map the loop for any vector length
>> >> >> >>>> >> other than 1 - I believe in this case we have to bail out and
>> >> >> >>report.
>> >> >> >>>> >> Is it about 'never profitable'?
>> >> >> >>>> >
>> >> >> >>>> > For example.  I think we should report non-vectorized loops
>> >> >> >>>> > that are marked with force_vect anyway, with
>> >> >> >>-Wdisabled-optimization.
>> >> >> >>>> > Another case is that a loop may be profitable to vectorize if
>> >> >> >>>> > the ISA supports a gather instruction but otherwise not.  Or if
>> >> >> >>the
>> >> >> >>>> > ISA supports efficient vector construction from N not loop
>> >> >> >>>> > invariant scalars (for vectorization of strided loads).
>> >> >> >>>> >
>> >> >> >>>> > Simply disregarding all of the cost analysis sounds completely
>> >> >> >>>> > bogus to me.
>> >> >> >>>> >
>> >> >> >>>> > I'd simply go for the diagnostic for now, not changing anything
>> >> >> >>else.
>> >> >> >>>> > We want to have a good understanding about why the cost model is
>> >> >> >>>> > so bad that we have to force to ignore it for #pragma simd - thus
>> >> >> >>we
>> >> >> >>>> > want testcases.
>> >> >> >>>> >
>> >> >> >>>> > Richard.
>> >> >> >>>> >
>> >> >> >>>> >>
>> >> >> >>>> >> On Tue, Nov 12, 2013 at 6:35 PM, Richard Biener
>> >> >> >><rguenther@suse.de> wrote:
>> >> >> >>>> >> > On 11/12/13 3:16 PM, Jakub Jelinek wrote:
>> >> >> >>>> >> >> On Tue, Nov 12, 2013 at 05:46:14PM +0400, Sergey Ostanevich
>> >> >> >>wrote:
>> >> >> >>>> >> >>> ivdep just substitutes all cross-iteration data analysis,
>> >> >> >>>> >> >>> nothing related to cost model. ICC does not cancel its
>> >> >> >>>> >> >>> cost model in case of #pragma ivdep
>> >> >> >>>> >> >>>
>> >> >> >>>> >> >>> as for the safelen - OMP standart treats it as a limitation
>> >> >> >>>> >> >>> for the vector length. this means if no safelen is present
>> >> >> >>>> >> >>> an arbitrary vector length can be used.
>> >> >> >>>> >> >>
>> >> >> >>>> >> >> I was talking about GCC loop->safelen, which is INT_MAX for
>> >> >> >>#pragma omp simd
>> >> >> >>>> >> >> without safelen clause or #pragma simd without vectorlength
>> >> >> >>clause.
>> >> >> >>>> >> >>
>> >> >> >>>> >> >>> so I believe loop->force_vect is the only trigger to
>> >> >> >>disregard
>> >> >> >>>> >> >>> the cost model
>> >> >> >>>> >> >>
>> >> >> >>>> >> >> Anyway, in that case I think the originally posted patch is
>> >> >> >>wrong,
>> >> >> >>>> >> >> if we want to treat force_vect as disregard all the cost model
>> >> >> >>and
>> >> >> >>>> >> >> force vectorization (well, the name of the field already kind
>> >> >> >>of suggest
>> >> >> >>>> >> >> that), then IMHO we should treat it the same as
>> >> >> >>-fvect-cost-model=unlimited
>> >> >> >>>> >> >> for those loops.
>> >> >> >>>> >> >
>> >> >> >>>> >> > Err - the user may have a specific sub-architecture in mind
>> >> >> >>when using
>> >> >> >>>> >> > #pragma simd, if you say we should completely ignore the cost
>> >> >> >>model
>> >> >> >>>> >> > then should we also sorry () if we cannot vectorize the loop
>> >> >> >>(either
>> >> >> >>>> >> > because of GCC deficiencies or lack of sub-target support)?
>> >> >> >>>> >> >
>> >> >> >>>> >> > That said, at least in the cases that the cost model says the
>> >> >> >>loop
>> >> >> >>>> >> > is never profitable to vectorize we should follow its advice.
>> >> >> >>>> >> >
>> >> >> >>>> >> > Richard.
>> >> >> >>>> >> >
>> >> >> >>>> >> >> Thus (untested):
>> >> >> >>>> >> >>
>> >> >> >>>> >> >> 2013-11-12  Jakub Jelinek  <jakub@redhat.com>
>> >> >> >>>> >> >>
>> >> >> >>>> >> >>       * tree-vect-loop.c (vect_estimate_min_profitable_iters):
>> >> >> >>Use
>> >> >> >>>> >> >>       unlimited cost model also for force_vect loops.
>> >> >> >>>> >> >>
>> >> >> >>>> >> >> --- gcc/tree-vect-loop.c.jj   2013-11-12 12:09:40.000000000
>> >> >> >>+0100
>> >> >> >>>> >> >> +++ gcc/tree-vect-loop.c      2013-11-12 15:11:43.821404330
>> >> >> >>+0100
>> >> >> >>>> >> >> @@ -2702,7 +2702,7 @@ vect_estimate_min_profitable_iters (loop
>> >> >> >>>> >> >>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA
>> >> >> >>(loop_vinfo);
>> >> >> >>>> >> >>
>> >> >> >>>> >> >>    /* Cost model disabled.  */
>> >> >> >>>> >> >> -  if (unlimited_cost_model ())
>> >> >> >>>> >> >> +  if (unlimited_cost_model () || LOOP_VINFO_LOOP
>> >> >> >>(loop_vinfo)->force_vect)
>> >> >> >>>> >> >>      {
>> >> >> >>>> >> >>        dump_printf_loc (MSG_NOTE, vect_location, "cost model
>> >> >> >>disabled.\n");
>> >> >> >>>> >> >>        *ret_min_profitable_niters = 0;
>> >> >> >>>> >> >>
>> >> >> >>>> >> >>       Jakub
>> >> >> >>>> >> >>
>> >> >> >>>> >> >
>> >> >> >>>> >>
>> >> >> >>>> >>
>> >> >> >>>> >
>> >> >> >>>> > --
>> >> >> >>>> > Richard Biener <rguenther@suse.de>
>> >> >> >>>> > SUSE / SUSE Labs
>> >> >> >>>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> >> >> >>>> > GF: Jeff Hawn, Jennifer Guild, Felix Imend
>> >> >> >>>>
>> >> >> >>>>
>> >> >> >>>
>> >> >> >>> --
>> >> >> >>> Richard Biener <rguenther@suse.de>
>> >> >> >>> SUSE / SUSE Labs
>> >> >> >>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> >> >> >>> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
>> >> >> >
>> >> >> >
>> >> >>
>> >> >
>> >> > --
>> >> > Richard Biener <rguenther@suse.de>
>> >> > SUSE / SUSE Labs
>> >> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> >> > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
>> >>
>> >>
>> >
>> > --
>> > Richard Biener <rguenther@suse.de>
>> > SUSE / SUSE Labs
>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
>>
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE / SUSE Labs
> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
Sergey Ostanevich Nov. 19, 2013, 2:39 p.m. UTC | #4
>> > I propose the following, yet SLP have to use a NULL as a loop info
>> > which looks somewhat hacky.
>>
>> I think this is overengineering.  -fvect-cost-model will do as
>> workaround.  And -fsimd-vect-cost-model has what I consider
>> duplicate - "simd" and "vect".
>
> I think it is a good idea, though I agree about s/simd-vect/simd/ and
> I'd use VECT_COST_MODEL_DEFAULT as the default, which would mean
> just use -fvect-cost-model.

that's ok, since we'd have a way to force those 'simd' loops.

>
>> > @@ -2929,6 +2929,11 @@ vect_estimate_min_profitable_iters
>> > (loop_vec_info loop_vinfo,
>> >    /* vector version will never be profitable.  */
>> >    else
>> >      {
>> > +      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
>> > +        {
>> > +          pedwarn (vect_location, 0, "Vectorization did not happen
>> > for the loop");
>> > +        }
>
> pedwarn isn't really desirable for this, you want just warning,
> but some warning you can actually also turn off.
> -Wopenmp-simd (and we'd use it also when we ignore #pragma omp declare simd
> because it wasn't useful/desirable).

consider a user is interested in enabling warning-as-error for this case?
can we disable the pedwarn the same way?

Sergos

>
>         Jakub
Jakub Jelinek Nov. 19, 2013, 2:42 p.m. UTC | #5
On Tue, Nov 19, 2013 at 06:39:48PM +0400, Sergey Ostanevich wrote:
> > pedwarn isn't really desirable for this, you want just warning,
> > but some warning you can actually also turn off.
> > -Wopenmp-simd (and we'd use it also when we ignore #pragma omp declare simd
> > because it wasn't useful/desirable).
> 
> consider a user is interested in enabling warning-as-error for this case?

-Werror=openmp-simd will work then, this works for any named warnings.

> can we disable the pedwarn the same way?

pedwarn is for pedantic warnings, no standard says that #pragma omp simd
must be vectorized, or that #pragma omp simd or #pragma omp declare simd
is anything but an optimization hint, so pedwarn isn't what you are looking
for.

	Jakub
diff mbox

Patch

diff --git a/gcc/common.opt b/gcc/common.opt
index d5971df..87b3b37 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2296,6 +2296,10 @@  fvect-cost-model=
 Common Joined RejectNegative Enum(vect_cost_model)
Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT)
 Specifies the cost model for vectorization

+fsimd-vect-cost-model=
+Common Joined RejectNegative Enum(vect_cost_model)
Var(flag_simd_vect_cost_model) Init(VECT_COST_MODEL_UNLIMITED)
+Specifies the cost model for vectorization in loops marked with
#pragma omp simd
+
 Enum
 Name(vect_cost_model) Type(enum vect_cost_model) UnknownError(unknown
vectorizer cost model %qs)

diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 83d1f45..e26f704 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -1090,7 +1090,8 @@  vect_peeling_hash_insert (loop_vec_info
loop_vinfo, struct data_reference *dr,
       *new_slot = slot;
     }

-  if (!supportable_dr_alignment && unlimited_cost_model ())
+  if (!supportable_dr_alignment
+      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     slot->count += VECT_MAX_COST;
 }

@@ -1200,7 +1201,7 @@  vect_peeling_hash_choose_best_peeling
(loop_vec_info loop_vinfo,
    res.peel_info.dr = NULL;
    res.body_cost_vec = stmt_vector_for_cost ();

-   if (!unlimited_cost_model ())
+   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
      {
        res.inside_cost = INT_MAX;
        res.outside_cost = INT_MAX;
@@ -1429,7 +1430,7 @@  vect_enhance_data_refs_alignment (loop_vec_info
loop_vinfo)
                  vectorization factor.
                  We do this automtically for cost model, since we
calculate cost
                  for every peeling option.  */
-              if (unlimited_cost_model ())
+              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                 possible_npeel_number = vf /nelements;

               /* Handle the aligned case. We may decide to align some other
@@ -1437,7 +1438,7 @@  vect_enhance_data_refs_alignment (loop_vec_info
loop_vinfo)
               if (DR_MISALIGNMENT (dr) == 0)
                 {
                   npeel_tmp = 0;
-                  if (unlimited_cost_model ())
+                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                     possible_npeel_number++;
                 }

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 86ebbd2..be66172 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2696,7 +2696,7 @@  vect_estimate_min_profitable_iters
(loop_vec_info loop_vinfo,
   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

   /* Cost model disabled.  */
-  if (unlimited_cost_model ())
+  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
       *ret_min_profitable_niters = 0;
@@ -2929,6 +2929,11 @@  vect_estimate_min_profitable_iters
(loop_vec_info loop_vinfo,
   /* vector version will never be profitable.  */
   else
     {
+      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
+        {
+          pedwarn (vect_location, 0, "Vectorization did not happen
for the loop");
+        }
+
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  "cost model: the vector iteration cost = %d "
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 247bdfd..4b25964 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2171,7 +2171,7 @@  vect_slp_analyze_bb_1 (basic_block bb)
     }

   /* Cost model: check if the vectorization is worthwhile.  */
-  if (!unlimited_cost_model ()
+  if (!unlimited_cost_model (NULL)
       && !vect_bb_vectorization_profitable_p (bb_vinfo))
     {
       if (dump_enabled_p ())
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a6c5b59..2916906 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -919,9 +919,12 @@  known_alignment_for_access_p (struct
data_reference *data_ref_info)

 /* Return true if the vect cost model is unlimited.  */
 static inline bool
-unlimited_cost_model ()
+unlimited_cost_model (loop_p loop)
 {
-  return flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED;
+  return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED
+          || (loop != NULL
+              && loop->force_vect
+              && flag_simd_vect_cost_model == VECT_COST_MODEL_UNLIMITED));
 }