diff mbox series

Add a bit dislike for separate mem alternative when op is REG_P.

Message ID 20220525033920.77449-1-hongtao.liu@intel.com
State New
Headers show
Series Add a bit dislike for separate mem alternative when op is REG_P. | expand

Commit Message

Liu, Hongtao May 25, 2022, 3:39 a.m. UTC
Rigt now, mem_cost for separate mem alternative is 1 * frequency which
is pretty small and caused the unnecessary SSE spill in the PR, I've tried
to rework backend cost model, but RA still not happy with that(regress
somewhere else). I think the root cause of this is cost for separate 'm'
alternative cost is too small, especially considering that the mov cost
of gpr are 2(default for REGISTER_MOVE_COST). So this patch increase mem_cost
to 2*frequency, also increase 1 for reg_class cost when m alternative.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/105513
	* ira-costs.cc (record_reg_classes): Increase both mem_cost
	and reg class cost by 1 for separate mem alternative when
	REG_P (op).

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr105513-1.c: New test.
---
 gcc/ira-costs.cc                           | 26 +++++++++++++---------
 gcc/testsuite/gcc.target/i386/pr105513-1.c | 16 +++++++++++++
 2 files changed, 31 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105513-1.c

Comments

Hongtao Liu May 25, 2022, 5:17 a.m. UTC | #1
On Wed, May 25, 2022 at 11:39 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Rigt now, mem_cost for separate mem alternative is 1 * frequency which
> is pretty small and caused the unnecessary SSE spill in the PR, I've tried
> to rework backend cost model, but RA still not happy with that(regress
> somewhere else). I think the root cause of this is cost for separate 'm'
> alternative cost is too small, especially considering that the mov cost
> of gpr are 2(default for REGISTER_MOVE_COST). So this patch increase mem_cost
> to 2*frequency, also increase 1 for reg_class cost when m alternative.
>
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/105513
>         * ira-costs.cc (record_reg_classes): Increase both mem_cost
>         and reg class cost by 1 for separate mem alternative when
>         REG_P (op).
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr105513-1.c: New test.
> ---
>  gcc/ira-costs.cc                           | 26 +++++++++++++---------
>  gcc/testsuite/gcc.target/i386/pr105513-1.c | 16 +++++++++++++
>  2 files changed, 31 insertions(+), 11 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr105513-1.c
>
> diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
> index 964c94a06ef..f7b8325e195 100644
> --- a/gcc/ira-costs.cc
> +++ b/gcc/ira-costs.cc
> @@ -625,7 +625,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>                           for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>                             {
>                               rclass = cost_classes[k];
> -                             pp_costs[k] = mem_cost[rclass][0] * frequency;
> +                             pp_costs[k] = (mem_cost[rclass][0]
> +                                            + 1) * frequency;
>                             }
>                         }
>                       else
> @@ -648,7 +649,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>                           for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>                             {
>                               rclass = cost_classes[k];
> -                             pp_costs[k] = mem_cost[rclass][1] * frequency;
> +                             pp_costs[k] = (mem_cost[rclass][1]
> +                                            + 1) * frequency;
>                             }
>                         }
>                       else
> @@ -670,9 +672,9 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>                           for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>                             {
>                               rclass = cost_classes[k];
> -                             pp_costs[k] = ((mem_cost[rclass][0]
> -                                             + mem_cost[rclass][1])
> -                                            * frequency);
> +                             pp_costs[k] = (mem_cost[rclass][0]
> +                                            + mem_cost[rclass][1]
> +                                            + 2) * frequency;
>                             }
>                         }
>                       else
> @@ -861,7 +863,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>                           for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>                             {
>                               rclass = cost_classes[k];
> -                             pp_costs[k] = mem_cost[rclass][0] * frequency;
> +                             pp_costs[k] = (mem_cost[rclass][0]
> +                                            + 1) * frequency;
>                             }
>                         }
>                       else
> @@ -884,7 +887,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>                           for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>                             {
>                               rclass = cost_classes[k];
> -                             pp_costs[k] = mem_cost[rclass][1] * frequency;
> +                             pp_costs[k] = (mem_cost[rclass][1]
> +                                            + 1) * frequency;
>                             }
>                         }
>                       else
> @@ -906,9 +910,9 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>                           for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>                             {
>                               rclass = cost_classes[k];
> -                             pp_costs[k] = ((mem_cost[rclass][0]
> -                                             + mem_cost[rclass][1])
> -                                            * frequency);
> +                             pp_costs[k] = (mem_cost[rclass][0]
> +                                            + mem_cost[rclass][1]
> +                                            + 2) * frequency;
>                             }
>                         }
>                       else
> @@ -929,7 +933,7 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>                     /* Although we don't need insn to reload from
>                        memory, still accessing memory is usually more
>                        expensive than a register.  */
> -                   pp->mem_cost = frequency;
> +                   pp->mem_cost = 2 * frequency;
>                   else
>                     /* If the alternative actually allows memory, make
>                        things a bit cheaper since we won't need an
> diff --git a/gcc/testsuite/gcc.target/i386/pr105513-1.c b/gcc/testsuite/gcc.target/i386/pr105513-1.c
> new file mode 100644
> index 00000000000..530f5292252
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105513-1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -msse2 -mtune=skylake -mfpmath=sse" } */
> +/* { dg-final { scan-assembler-not "\\(%rsp\\)" } } */
> +
> +static int as_int(float x)
> +{
> +    return (union{float x; int i;}){x}.i;
> +}
> +
> +float f(double y, float x)
> +{
> +    int i = as_int(x);
> +    if (__builtin_expect(i > 99, 0)) return 0;
> +    if (i*2u < 77) if (i==2) return 0;
> +    return y*x;
> +}
> --
> 2.18.1
>
Vladimir Makarov May 26, 2022, 9:12 p.m. UTC | #2
On 2022-05-24 23:39, liuhongt wrote:
> Rigt now, mem_cost for separate mem alternative is 1 * frequency which
> is pretty small and caused the unnecessary SSE spill in the PR, I've tried
> to rework backend cost model, but RA still not happy with that(regress
> somewhere else). I think the root cause of this is cost for separate 'm'
> alternative cost is too small, especially considering that the mov cost
> of gpr are 2(default for REGISTER_MOVE_COST). So this patch increase mem_cost
> to 2*frequency, also increase 1 for reg_class cost when m alternative.
>
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

Thank you for addressing this problem. And sorry I can not approve this 
patch at least w/o your additional work on benchmarking this change.

This code is very old.  It is coming from older RA (former file 
regclass.c) and existed practically since GCC day 1.  People tried many 
times to improve this code.  The code also affects many targets.

I can approve this patch if you show that there is no regression at 
least on x86-64 on some credible benchmark, e.g. SPEC2006 or SPEC2017.

I know it is a big work but when I myself do such changes I check 
SPEC2017.  I rejected my changes like this one several times when I 
benchmarked them on SPEC2017 although at the first glance they looked 
reasonable.

> gcc/ChangeLog:
>
> 	PR target/105513
> 	* ira-costs.cc (record_reg_classes): Increase both mem_cost
> 	and reg class cost by 1 for separate mem alternative when
> 	REG_P (op).
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/i386/pr105513-1.c: New test.
> ---
>   gcc/ira-costs.cc                           | 26 +++++++++++++---------
>   gcc/testsuite/gcc.target/i386/pr105513-1.c | 16 +++++++++++++
>   2 files changed, 31 insertions(+), 11 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/i386/pr105513-1.c
>
> diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
> index 964c94a06ef..f7b8325e195 100644
> --- a/gcc/ira-costs.cc
> +++ b/gcc/ira-costs.cc
> @@ -625,7 +625,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>   			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>   			    {
>   			      rclass = cost_classes[k];
> -			      pp_costs[k] = mem_cost[rclass][0] * frequency;
> +			      pp_costs[k] = (mem_cost[rclass][0]
> +					     + 1) * frequency;
>   			    }
>   			}
>   		      else
> @@ -648,7 +649,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>   			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>   			    {
>   			      rclass = cost_classes[k];
> -			      pp_costs[k] = mem_cost[rclass][1] * frequency;
> +			      pp_costs[k] = (mem_cost[rclass][1]
> +					     + 1) * frequency;
>   			    }
>   			}
>   		      else
> @@ -670,9 +672,9 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>   			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>   			    {
>   			      rclass = cost_classes[k];
> -			      pp_costs[k] = ((mem_cost[rclass][0]
> -					      + mem_cost[rclass][1])
> -					     * frequency);
> +			      pp_costs[k] = (mem_cost[rclass][0]
> +					     + mem_cost[rclass][1]
> +					     + 2) * frequency;
>   			    }
>   			}
>   		      else
> @@ -861,7 +863,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>   			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>   			    {
>   			      rclass = cost_classes[k];
> -			      pp_costs[k] = mem_cost[rclass][0] * frequency;
> +			      pp_costs[k] = (mem_cost[rclass][0]
> +					     + 1) * frequency;
>   			    }
>   			}
>   		      else
> @@ -884,7 +887,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>   			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>   			    {
>   			      rclass = cost_classes[k];
> -			      pp_costs[k] = mem_cost[rclass][1] * frequency;
> +			      pp_costs[k] = (mem_cost[rclass][1]
> +					     + 1) * frequency;
>   			    }
>   			}
>   		      else
> @@ -906,9 +910,9 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>   			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
>   			    {
>   			      rclass = cost_classes[k];
> -			      pp_costs[k] = ((mem_cost[rclass][0]
> -					      + mem_cost[rclass][1])
> -					     * frequency);
> +			      pp_costs[k] = (mem_cost[rclass][0]
> +					     + mem_cost[rclass][1]
> +					     + 2) * frequency;
>   			    }
>   			}
>   		      else
> @@ -929,7 +933,7 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
>   		    /* Although we don't need insn to reload from
>   		       memory, still accessing memory is usually more
>   		       expensive than a register.  */
> -		    pp->mem_cost = frequency;
> +		    pp->mem_cost = 2 * frequency;
>   		  else
>   		    /* If the alternative actually allows memory, make
>   		       things a bit cheaper since we won't need an
> diff --git a/gcc/testsuite/gcc.target/i386/pr105513-1.c b/gcc/testsuite/gcc.target/i386/pr105513-1.c
> new file mode 100644
> index 00000000000..530f5292252
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105513-1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -msse2 -mtune=skylake -mfpmath=sse" } */
> +/* { dg-final { scan-assembler-not "\\(%rsp\\)" } } */
> +
> +static int as_int(float x)
> +{
> +    return (union{float x; int i;}){x}.i;
> +}
> +
> +float f(double y, float x)
> +{
> +    int i = as_int(x);
> +    if (__builtin_expect(i > 99, 0)) return 0;
> +    if (i*2u < 77) if (i==2) return 0;
> +    return y*x;
> +}
Alexander Monakov May 27, 2022, 9:39 a.m. UTC | #3
On Wed, 25 May 2022, liuhongt via Gcc-patches wrote:

> Rigt now, mem_cost for separate mem alternative is 1 * frequency which
> is pretty small and caused the unnecessary SSE spill in the PR, I've tried
> to rework backend cost model, but RA still not happy with that(regress
> somewhere else). I think the root cause of this is cost for separate 'm'
> alternative cost is too small, especially considering that the mov cost
> of gpr are 2(default for REGISTER_MOVE_COST). So this patch increase mem_cost
> to 2*frequency, also increase 1 for reg_class cost when m alternative.

In the PR, the spill happens in the initial basic block of the function, i.e.
the one with the highest frequency.

Also as noted in the PR, swapping the 'unlikely' branch to 'likely' avoids
the spill, even though it does not affect the frequency of the initial basic
block, and makes the block with the use more rarely executed.

Do you have a root cause analysis that explains the above?

Alexander
Li, Pan2 via Gcc-patches May 30, 2022, 2:52 a.m. UTC | #4
> -----Original Message-----
> From: Alexander Monakov <amonakov@ispras.ru>
> Sent: Friday, May 27, 2022 5:39 PM
> To: Liu, Hongtao <hongtao.liu@intel.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] Add a bit dislike for separate mem alternative when op is
> REG_P.
> 
> On Wed, 25 May 2022, liuhongt via Gcc-patches wrote:
> 
> > Rigt now, mem_cost for separate mem alternative is 1 * frequency which
> > is pretty small and caused the unnecessary SSE spill in the PR, I've
> > tried to rework backend cost model, but RA still not happy with
> > that(regress somewhere else). I think the root cause of this is cost for separate
> 'm'
> > alternative cost is too small, especially considering that the mov
> > cost of gpr are 2(default for REGISTER_MOVE_COST). So this patch
> > increase mem_cost to 2*frequency, also increase 1 for reg_class cost when m
> alternative.
> 
> In the PR, the spill happens in the initial basic block of the function, i.e.
> the one with the highest frequency.
> 
> Also as noted in the PR, swapping the 'unlikely' branch to 'likely' avoids the spill,
> even though it does not affect the frequency of the initial basic block, and
> makes the block with the use more rarely executed.

The spill is mainly decided by 3 insns related to r92

283(insn 3 61 4 2 (set (reg/v:SF 92 [ x ])
284        (reg:SF 102)) "test3.c":7:1 142 {*movsf_internal}
285     (expr_list:REG_DEAD (reg:SF 102)

288(insn 9 4 12 2 (set (reg:SI 89 [ _11 ])
289        (subreg:SI (reg/v:SF 92 [ x ]) 0)) "test3.c":3:36 81 {*movsi_internal}
290     (nil))

And
382(insn 28 27 29 5 (set (reg:DF 98)
383        (float_extend:DF (reg/v:SF 92 [ x ]))) "test3.c":11:13 163 {*extendsfdf2}
384     (expr_list:REG_DEAD (reg/v:SF 92 [ x ])
385        (nil)))
386(insn 29 28 30 5 (s

The frequency the for INSN 3 and INSN 9 is not affected, but frequency of INSN 28 drop from 805 -> 89 after swapping "unlikely" and "likely".
Because of that, GPR cost decreases a lot, finally make the RA choose GPR instead of MEM.

GENERAL_REGS:2356,2356 
SSE_REGS:6000,6000
MEM:4089,4089

Dump of 301.ira:
67  a4(r92,l0) costs: AREG:2356,2356 DREG:2356,2356 CREG:2356,2356 BREG:2356,2356 SIREG:2356,2356 DIREG:2356,2356 AD_REGS:2356,2356 CLOBBERED_REGS:2356,2356 Q_REGS:2356,2356 NON_Q_REGS:2356,2356 TLS_GOTBASE_REGS:2356,2356 GENERAL_REGS:2356,2356 SSE_FIRST_REG:6000,6000 NO_REX_SSE_REGS:6000,6000 SSE_REGS:6000,6000 \
   MMX_REGS:19534,19534 INT_SSE_REGS:19534,19534 ALL_REGS:214534,214534 MEM:4089,4089

And although there's no spill, there's an extra VMOVD in the later BB which looks suboptimal(Guess we can stand with that since it's cold.)

24        vmovd   %eax, %xmm2
25        vcvtss2sd       %xmm2, %xmm2, %xmm1
26        vmulsd  %xmm0, %xmm1, %xmm0
27        vcvtsd2ss       %xmm0, %xmm0, %xmm0
> 
> Do you have a root cause analysis that explains the above?
> 
> Alexander
Hongtao Liu May 30, 2022, 3:05 a.m. UTC | #5
On Fri, May 27, 2022 at 5:12 AM Vladimir Makarov via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> On 2022-05-24 23:39, liuhongt wrote:
> > Rigt now, mem_cost for separate mem alternative is 1 * frequency which
> > is pretty small and caused the unnecessary SSE spill in the PR, I've tried
> > to rework backend cost model, but RA still not happy with that(regress
> > somewhere else). I think the root cause of this is cost for separate 'm'
> > alternative cost is too small, especially considering that the mov cost
> > of gpr are 2(default for REGISTER_MOVE_COST). So this patch increase mem_cost
> > to 2*frequency, also increase 1 for reg_class cost when m alternative.
> >
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> Thank you for addressing this problem. And sorry I can not approve this
> patch at least w/o your additional work on benchmarking this change.
>
> This code is very old.  It is coming from older RA (former file
> regclass.c) and existed practically since GCC day 1.  People tried many
> times to improve this code.  The code also affects many targets.
Yes, that's why I increased it as low as possible, so it won't regress
#c6 in the PR.
>
> I can approve this patch if you show that there is no regression at
> least on x86-64 on some credible benchmark, e.g. SPEC2006 or SPEC2017.
>
I've tested the patch for SPEC2017 with both  -march=cascadelake
-Ofast -flto and -O2 -mtune=generic.
No obvious regression is observed, the binaries are all different from
before, so I looked at 2 of them, the difference mainly comes from
different choices of registers(xmm13 -> xmm12).
Ok for trunk then?
> I know it is a big work but when I myself do such changes I check
> SPEC2017.  I rejected my changes like this one several times when I
> benchmarked them on SPEC2017 although at the first glance they looked
> reasonable.
>
> > gcc/ChangeLog:
> >
> >       PR target/105513
> >       * ira-costs.cc (record_reg_classes): Increase both mem_cost
> >       and reg class cost by 1 for separate mem alternative when
> >       REG_P (op).
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/i386/pr105513-1.c: New test.
> > ---
> >   gcc/ira-costs.cc                           | 26 +++++++++++++---------
> >   gcc/testsuite/gcc.target/i386/pr105513-1.c | 16 +++++++++++++
> >   2 files changed, 31 insertions(+), 11 deletions(-)
> >   create mode 100644 gcc/testsuite/gcc.target/i386/pr105513-1.c
> >
> > diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
> > index 964c94a06ef..f7b8325e195 100644
> > --- a/gcc/ira-costs.cc
> > +++ b/gcc/ira-costs.cc
> > @@ -625,7 +625,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
> >                         for (k = cost_classes_ptr->num - 1; k >= 0; k--)
> >                           {
> >                             rclass = cost_classes[k];
> > -                           pp_costs[k] = mem_cost[rclass][0] * frequency;
> > +                           pp_costs[k] = (mem_cost[rclass][0]
> > +                                          + 1) * frequency;
> >                           }
> >                       }
> >                     else
> > @@ -648,7 +649,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
> >                         for (k = cost_classes_ptr->num - 1; k >= 0; k--)
> >                           {
> >                             rclass = cost_classes[k];
> > -                           pp_costs[k] = mem_cost[rclass][1] * frequency;
> > +                           pp_costs[k] = (mem_cost[rclass][1]
> > +                                          + 1) * frequency;
> >                           }
> >                       }
> >                     else
> > @@ -670,9 +672,9 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
> >                         for (k = cost_classes_ptr->num - 1; k >= 0; k--)
> >                           {
> >                             rclass = cost_classes[k];
> > -                           pp_costs[k] = ((mem_cost[rclass][0]
> > -                                           + mem_cost[rclass][1])
> > -                                          * frequency);
> > +                           pp_costs[k] = (mem_cost[rclass][0]
> > +                                          + mem_cost[rclass][1]
> > +                                          + 2) * frequency;
> >                           }
> >                       }
> >                     else
> > @@ -861,7 +863,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
> >                         for (k = cost_classes_ptr->num - 1; k >= 0; k--)
> >                           {
> >                             rclass = cost_classes[k];
> > -                           pp_costs[k] = mem_cost[rclass][0] * frequency;
> > +                           pp_costs[k] = (mem_cost[rclass][0]
> > +                                          + 1) * frequency;
> >                           }
> >                       }
> >                     else
> > @@ -884,7 +887,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
> >                         for (k = cost_classes_ptr->num - 1; k >= 0; k--)
> >                           {
> >                             rclass = cost_classes[k];
> > -                           pp_costs[k] = mem_cost[rclass][1] * frequency;
> > +                           pp_costs[k] = (mem_cost[rclass][1]
> > +                                          + 1) * frequency;
> >                           }
> >                       }
> >                     else
> > @@ -906,9 +910,9 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
> >                         for (k = cost_classes_ptr->num - 1; k >= 0; k--)
> >                           {
> >                             rclass = cost_classes[k];
> > -                           pp_costs[k] = ((mem_cost[rclass][0]
> > -                                           + mem_cost[rclass][1])
> > -                                          * frequency);
> > +                           pp_costs[k] = (mem_cost[rclass][0]
> > +                                          + mem_cost[rclass][1]
> > +                                          + 2) * frequency;
> >                           }
> >                       }
> >                     else
> > @@ -929,7 +933,7 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
> >                   /* Although we don't need insn to reload from
> >                      memory, still accessing memory is usually more
> >                      expensive than a register.  */
> > -                 pp->mem_cost = frequency;
> > +                 pp->mem_cost = 2 * frequency;
> >                 else
> >                   /* If the alternative actually allows memory, make
> >                      things a bit cheaper since we won't need an
> > diff --git a/gcc/testsuite/gcc.target/i386/pr105513-1.c b/gcc/testsuite/gcc.target/i386/pr105513-1.c
> > new file mode 100644
> > index 00000000000..530f5292252
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr105513-1.c
> > @@ -0,0 +1,16 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -msse2 -mtune=skylake -mfpmath=sse" } */
> > +/* { dg-final { scan-assembler-not "\\(%rsp\\)" } } */
> > +
> > +static int as_int(float x)
> > +{
> > +    return (union{float x; int i;}){x}.i;
> > +}
> > +
> > +float f(double y, float x)
> > +{
> > +    int i = as_int(x);
> > +    if (__builtin_expect(i > 99, 0)) return 0;
> > +    if (i*2u < 77) if (i==2) return 0;
> > +    return y*x;
> > +}
>
Alexander Monakov May 30, 2022, 6:22 a.m. UTC | #6
> > In the PR, the spill happens in the initial basic block of the function, i.e.
> > the one with the highest frequency.
> > 
> > Also as noted in the PR, swapping the 'unlikely' branch to 'likely' avoids the spill,
> > even though it does not affect the frequency of the initial basic block, and
> > makes the block with the use more rarely executed.
> 
> The spill is mainly decided by 3 insns related to r92
> 
> 283(insn 3 61 4 2 (set (reg/v:SF 92 [ x ])
> 284        (reg:SF 102)) "test3.c":7:1 142 {*movsf_internal}
> 285     (expr_list:REG_DEAD (reg:SF 102)
> 
> 288(insn 9 4 12 2 (set (reg:SI 89 [ _11 ])
> 289        (subreg:SI (reg/v:SF 92 [ x ]) 0)) "test3.c":3:36 81 {*movsi_internal}
> 290     (nil))
> 
> And
> 382(insn 28 27 29 5 (set (reg:DF 98)
> 383        (float_extend:DF (reg/v:SF 92 [ x ]))) "test3.c":11:13 163 {*extendsfdf2}
> 384     (expr_list:REG_DEAD (reg/v:SF 92 [ x ])
> 385        (nil)))
> 386(insn 29 28 30 5 (s
> 
> The frequency the for INSN 3 and INSN 9 is not affected, but frequency of INSN
> 28 drop from 805 -> 89 after swapping "unlikely" and "likely".  Because of
> that, GPR cost decreases a lot, finally make the RA choose GPR instead of MEM.
> 
> GENERAL_REGS:2356,2356 
> SSE_REGS:6000,6000
> MEM:4089,4089

But why are SSE_REGS costed so high? r92 is used in SFmode, it doesn't make
sense that selecting a GPR for it looks cheaper than xmm0.

> Dump of 301.ira:
> 67  a4(r92,l0) costs: AREG:2356,2356 DREG:2356,2356 CREG:2356,2356 BREG:2356,2356 SIREG:2356,2356 DIREG:2356,2356 AD_REGS:2356,2356 CLOBBERED_REGS:2356,2356 Q_REGS:2356,2356 NON_Q_REGS:2356,2356 TLS_GOTBASE_REGS:2356,2356 GENERAL_REGS:2356,2356 SSE_FIRST_REG:6000,6000 NO_REX_SSE_REGS:6000,6000 SSE_REGS:6000,6000 \
>    MMX_REGS:19534,19534 INT_SSE_REGS:19534,19534 ALL_REGS:214534,214534 MEM:4089,4089
> 
> And although there's no spill, there's an extra VMOVD in the later BB which
> looks suboptimal(Guess we can stand with that since it's cold.)

I think that falls out of the wrong decision for SSE_REGS cost.

Alexander

> 
> 24        vmovd   %eax, %xmm2
> 25        vcvtss2sd       %xmm2, %xmm2, %xmm1
> 26        vmulsd  %xmm0, %xmm1, %xmm0
> 27        vcvtsd2ss       %xmm0, %xmm0, %xmm0
> > 
> > Do you have a root cause analysis that explains the above?
> > 
> > Alexander
Hongtao Liu May 30, 2022, 7:14 a.m. UTC | #7
On Mon, May 30, 2022 at 2:22 PM Alexander Monakov via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> > > In the PR, the spill happens in the initial basic block of the function, i.e.
> > > the one with the highest frequency.
> > >
> > > Also as noted in the PR, swapping the 'unlikely' branch to 'likely' avoids the spill,
> > > even though it does not affect the frequency of the initial basic block, and
> > > makes the block with the use more rarely executed.
> >
> > The spill is mainly decided by 3 insns related to r92
> >
> > 283(insn 3 61 4 2 (set (reg/v:SF 92 [ x ])
> > 284        (reg:SF 102)) "test3.c":7:1 142 {*movsf_internal}
> > 285     (expr_list:REG_DEAD (reg:SF 102)
> >
> > 288(insn 9 4 12 2 (set (reg:SI 89 [ _11 ])
> > 289        (subreg:SI (reg/v:SF 92 [ x ]) 0)) "test3.c":3:36 81 {*movsi_internal}
> > 290     (nil))
> >
> > And
> > 382(insn 28 27 29 5 (set (reg:DF 98)
> > 383        (float_extend:DF (reg/v:SF 92 [ x ]))) "test3.c":11:13 163 {*extendsfdf2}
> > 384     (expr_list:REG_DEAD (reg/v:SF 92 [ x ])
> > 385        (nil)))
> > 386(insn 29 28 30 5 (s
> >
> > The frequency the for INSN 3 and INSN 9 is not affected, but frequency of INSN
> > 28 drop from 805 -> 89 after swapping "unlikely" and "likely".  Because of
> > that, GPR cost decreases a lot, finally make the RA choose GPR instead of MEM.
> >
> > GENERAL_REGS:2356,2356
> > SSE_REGS:6000,6000
> > MEM:4089,4089
>
> But why are SSE_REGS costed so high? r92 is used in SFmode, it doesn't make
> sense that selecting a GPR for it looks cheaper than xmm0.
For INSN3 and INSN 28, SSE_REGS costs zero.
But for INSN 9, it's a SImode move, we have disparaged non-gpr
alternatives in movsi_internal pattern which finally makes SSE_REGS
costs 6 * 1000(1000 is frequency, 6 is move cost between SSE_REGS and
GPR, sse_to_integer/integer_to_sse).
value of sse_to_integer/integer_to_sse is decided as 6(3 times as GPR
move cost) to avoid too many spills between gpr and xmm, we once set
sse_to_integer/integer_to_sse  as 2 and generated many movd
instructions but finally caused frequency drop and regressed
performance.

Another choice is changing movsi_internal alternatives from *v to
?v(just like what we did in *movsf_internal for gpr alternatives),
then the separate change can fix PR, and also eliminate the extra
movd, but it may regress cases elsewhere.
Any thoughts for changing *v to ?v @Uros Bizjak
>
> > Dump of 301.ira:
> > 67  a4(r92,l0) costs: AREG:2356,2356 DREG:2356,2356 CREG:2356,2356 BREG:2356,2356 SIREG:2356,2356 DIREG:2356,2356 AD_REGS:2356,2356 CLOBBERED_REGS:2356,2356 Q_REGS:2356,2356 NON_Q_REGS:2356,2356 TLS_GOTBASE_REGS:2356,2356 GENERAL_REGS:2356,2356 SSE_FIRST_REG:6000,6000 NO_REX_SSE_REGS:6000,6000 SSE_REGS:6000,6000 \
> >    MMX_REGS:19534,19534 INT_SSE_REGS:19534,19534 ALL_REGS:214534,214534 MEM:4089,4089
> >
> > And although there's no spill, there's an extra VMOVD in the later BB which
> > looks suboptimal(Guess we can stand with that since it's cold.)
>
> I think that falls out of the wrong decision for SSE_REGS cost.
>
> Alexander
>
> >
> > 24        vmovd   %eax, %xmm2
> > 25        vcvtss2sd       %xmm2, %xmm2, %xmm1
> > 26        vmulsd  %xmm0, %xmm1, %xmm0
> > 27        vcvtsd2ss       %xmm0, %xmm0, %xmm0
> > >
> > > Do you have a root cause analysis that explains the above?
> > >
> > > Alexander
Alexander Monakov May 30, 2022, 7:44 a.m. UTC | #8
On Mon, 30 May 2022, Hongtao Liu wrote:

> On Mon, May 30, 2022 at 2:22 PM Alexander Monakov via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > The spill is mainly decided by 3 insns related to r92
> > >
> > > 283(insn 3 61 4 2 (set (reg/v:SF 92 [ x ])
> > > 284        (reg:SF 102)) "test3.c":7:1 142 {*movsf_internal}
> > > 285     (expr_list:REG_DEAD (reg:SF 102)
> > >
> > > 288(insn 9 4 12 2 (set (reg:SI 89 [ _11 ])
> > > 289        (subreg:SI (reg/v:SF 92 [ x ]) 0)) "test3.c":3:36 81 {*movsi_internal}
> > > 290     (nil))
> > >
> > > And
> > > 382(insn 28 27 29 5 (set (reg:DF 98)
> > > 383        (float_extend:DF (reg/v:SF 92 [ x ]))) "test3.c":11:13 163 {*extendsfdf2}
> > > 384     (expr_list:REG_DEAD (reg/v:SF 92 [ x ])
> > > 385        (nil)))
> > > 386(insn 29 28 30 5 (s
> > >
> > > The frequency the for INSN 3 and INSN 9 is not affected, but frequency of INSN
> > > 28 drop from 805 -> 89 after swapping "unlikely" and "likely".  Because of
> > > that, GPR cost decreases a lot, finally make the RA choose GPR instead of MEM.
> > >
> > > GENERAL_REGS:2356,2356
> > > SSE_REGS:6000,6000
> > > MEM:4089,4089
> >
> > But why are SSE_REGS costed so high? r92 is used in SFmode, it doesn't make
> > sense that selecting a GPR for it looks cheaper than xmm0.
> For INSN3 and INSN 28, SSE_REGS costs zero.
> But for INSN 9, it's a SImode move, we have disparaged non-gpr
> alternatives in movsi_internal pattern which finally makes SSE_REGS
> costs 6 * 1000(1000 is frequency, 6 is move cost between SSE_REGS and
> GPR, sse_to_integer/integer_to_sse).

But wait, selecting a GPR for r92 makes insn 3 (movsf_internal) an
sse-to-integer move, so it should be equally high cost? Not to mention
that the use in insn 28 (extendsfdf2) should have higher cost also.

Why does GPR cost 2356 instead of 6000 for insn 3 plus extra for insn 28?

Alexander
Hongtao Liu May 30, 2022, 8:34 a.m. UTC | #9
On Mon, May 30, 2022 at 3:44 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Mon, 30 May 2022, Hongtao Liu wrote:
>
> > On Mon, May 30, 2022 at 2:22 PM Alexander Monakov via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > The spill is mainly decided by 3 insns related to r92
> > > >
> > > > 283(insn 3 61 4 2 (set (reg/v:SF 92 [ x ])
> > > > 284        (reg:SF 102)) "test3.c":7:1 142 {*movsf_internal}
> > > > 285     (expr_list:REG_DEAD (reg:SF 102)
> > > >
> > > > 288(insn 9 4 12 2 (set (reg:SI 89 [ _11 ])
> > > > 289        (subreg:SI (reg/v:SF 92 [ x ]) 0)) "test3.c":3:36 81 {*movsi_internal}
> > > > 290     (nil))
> > > >
> > > > And
> > > > 382(insn 28 27 29 5 (set (reg:DF 98)
> > > > 383        (float_extend:DF (reg/v:SF 92 [ x ]))) "test3.c":11:13 163 {*extendsfdf2}
> > > > 384     (expr_list:REG_DEAD (reg/v:SF 92 [ x ])
> > > > 385        (nil)))
> > > > 386(insn 29 28 30 5 (s
> > > >
> > > > The frequency the for INSN 3 and INSN 9 is not affected, but frequency of INSN
> > > > 28 drop from 805 -> 89 after swapping "unlikely" and "likely".  Because of
> > > > that, GPR cost decreases a lot, finally make the RA choose GPR instead of MEM.
> > > >
> > > > GENERAL_REGS:2356,2356
> > > > SSE_REGS:6000,6000
> > > > MEM:4089,4089
> > >
> > > But why are SSE_REGS costed so high? r92 is used in SFmode, it doesn't make
> > > sense that selecting a GPR for it looks cheaper than xmm0.
> > For INSN3 and INSN 28, SSE_REGS costs zero.
> > But for INSN 9, it's a SImode move, we have disparaged non-gpr
> > alternatives in movsi_internal pattern which finally makes SSE_REGS
> > costs 6 * 1000(1000 is frequency, 6 is move cost between SSE_REGS and
> > GPR, sse_to_integer/integer_to_sse).
>
> But wait, selecting a GPR for r92 makes insn 3 (movsf_internal) an
> sse-to-integer move, so it should be equally high cost? Not to mention
> that the use in insn 28 (extendsfdf2) should have higher cost also.
>
> Why does GPR cost 2356 instead of 6000 for insn 3 plus extra for insn 28?
First GPR cost in insn 3 is not necessarily equal to integer_to_sse,
it's the minimal cost of all alternatives, and one alternative is ?r,
the cost is 2.

I think the difference in movsf_internal and movsi_internal for *v and
?r make RA finally choose GPR.
>
> Alexander
Alexander Monakov May 30, 2022, 9:41 a.m. UTC | #10
On Mon, 30 May 2022, Hongtao Liu wrote:

> On Mon, May 30, 2022 at 3:44 PM Alexander Monakov <amonakov@ispras.ru> wrote:
> >
> > On Mon, 30 May 2022, Hongtao Liu wrote:
> >
> > > On Mon, May 30, 2022 at 2:22 PM Alexander Monakov via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > > >
> > > > > The spill is mainly decided by 3 insns related to r92
> > > > >
> > > > > 283(insn 3 61 4 2 (set (reg/v:SF 92 [ x ])
> > > > > 284        (reg:SF 102)) "test3.c":7:1 142 {*movsf_internal}
> > > > > 285     (expr_list:REG_DEAD (reg:SF 102)
> > > > >
> > > > > 288(insn 9 4 12 2 (set (reg:SI 89 [ _11 ])
> > > > > 289        (subreg:SI (reg/v:SF 92 [ x ]) 0)) "test3.c":3:36 81 {*movsi_internal}
> > > > > 290     (nil))
> > > > >
> > > > > And
> > > > > 382(insn 28 27 29 5 (set (reg:DF 98)
> > > > > 383        (float_extend:DF (reg/v:SF 92 [ x ]))) "test3.c":11:13 163 {*extendsfdf2}
> > > > > 384     (expr_list:REG_DEAD (reg/v:SF 92 [ x ])
> > > > > 385        (nil)))
> > > > > 386(insn 29 28 30 5 (s
> > > > >
> > > > > The frequency the for INSN 3 and INSN 9 is not affected, but frequency of INSN
> > > > > 28 drop from 805 -> 89 after swapping "unlikely" and "likely".  Because of
> > > > > that, GPR cost decreases a lot, finally make the RA choose GPR instead of MEM.
> > > > >
> > > > > GENERAL_REGS:2356,2356
> > > > > SSE_REGS:6000,6000
> > > > > MEM:4089,4089
> > > >
> > > > But why are SSE_REGS costed so high? r92 is used in SFmode, it doesn't make
> > > > sense that selecting a GPR for it looks cheaper than xmm0.
> > > For INSN3 and INSN 28, SSE_REGS costs zero.
> > > But for INSN 9, it's a SImode move, we have disparaged non-gpr
> > > alternatives in movsi_internal pattern which finally makes SSE_REGS
> > > costs 6 * 1000(1000 is frequency, 6 is move cost between SSE_REGS and
> > > GPR, sse_to_integer/integer_to_sse).
> >
> > But wait, selecting a GPR for r92 makes insn 3 (movsf_internal) an
> > sse-to-integer move, so it should be equally high cost? Not to mention
> > that the use in insn 28 (extendsfdf2) should have higher cost also.
> >
> > Why does GPR cost 2356 instead of 6000 for insn 3 plus extra for insn 28?
> First GPR cost in insn 3 is not necessarily equal to integer_to_sse,
> it's the minimal cost of all alternatives, and one alternative is ?r,
> the cost is 2.
> 
> I think the difference in movsf_internal and movsi_internal for *v and
> ?r make RA finally choose GPR.

I think this is one of the main issues here, if in the end it's the same
'mov %xmmN, <gpr>' instruction, only the pattern name is different.

Alexander
Vladimir Makarov May 31, 2022, 4:28 p.m. UTC | #11
On 2022-05-29 23:05, Hongtao Liu wrote:
> On Fri, May 27, 2022 at 5:12 AM Vladimir Makarov via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> On 2022-05-24 23:39, liuhongt wrote:
>>> Rigt now, mem_cost for separate mem alternative is 1 * frequency which
>>> is pretty small and caused the unnecessary SSE spill in the PR, I've tried
>>> to rework backend cost model, but RA still not happy with that(regress
>>> somewhere else). I think the root cause of this is cost for separate 'm'
>>> alternative cost is too small, especially considering that the mov cost
>>> of gpr are 2(default for REGISTER_MOVE_COST). So this patch increase mem_cost
>>> to 2*frequency, also increase 1 for reg_class cost when m alternative.
>>>
>>>
>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>>> Ok for trunk?
>> Thank you for addressing this problem. And sorry I can not approve this
>> patch at least w/o your additional work on benchmarking this change.
>>
>> This code is very old.  It is coming from older RA (former file
>> regclass.c) and existed practically since GCC day 1.  People tried many
>> times to improve this code.  The code also affects many targets.
> Yes, that's why I increased it as low as possible, so it won't regress
> #c6 in the PR.
>> I can approve this patch if you show that there is no regression at
>> least on x86-64 on some credible benchmark, e.g. SPEC2006 or SPEC2017.
>>
> I've tested the patch for SPEC2017 with both  -march=cascadelake
> -Ofast -flto and -O2 -mtune=generic.
> No obvious regression is observed, the binaries are all different from
> before, so I looked at 2 of them, the difference mainly comes from
> different choices of registers(xmm13 -> xmm12).
> Ok for trunk then?

OK.

Thank you for checking SPEC2017.

I hope it will not create troubles for other targets.
Richard Sandiford May 31, 2022, 4:40 p.m. UTC | #12
Vladimir Makarov via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On 2022-05-29 23:05, Hongtao Liu wrote:
>> On Fri, May 27, 2022 at 5:12 AM Vladimir Makarov via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>>>
>>> On 2022-05-24 23:39, liuhongt wrote:
>>>> Rigt now, mem_cost for separate mem alternative is 1 * frequency which
>>>> is pretty small and caused the unnecessary SSE spill in the PR, I've tried
>>>> to rework backend cost model, but RA still not happy with that(regress
>>>> somewhere else). I think the root cause of this is cost for separate 'm'
>>>> alternative cost is too small, especially considering that the mov cost
>>>> of gpr are 2(default for REGISTER_MOVE_COST). So this patch increase mem_cost
>>>> to 2*frequency, also increase 1 for reg_class cost when m alternative.
>>>>
>>>>
>>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>>>> Ok for trunk?
>>> Thank you for addressing this problem. And sorry I can not approve this
>>> patch at least w/o your additional work on benchmarking this change.
>>>
>>> This code is very old.  It is coming from older RA (former file
>>> regclass.c) and existed practically since GCC day 1.  People tried many
>>> times to improve this code.  The code also affects many targets.
>> Yes, that's why I increased it as low as possible, so it won't regress
>> #c6 in the PR.
>>> I can approve this patch if you show that there is no regression at
>>> least on x86-64 on some credible benchmark, e.g. SPEC2006 or SPEC2017.
>>>
>> I've tested the patch for SPEC2017 with both  -march=cascadelake
>> -Ofast -flto and -O2 -mtune=generic.
>> No obvious regression is observed, the binaries are all different from
>> before, so I looked at 2 of them, the difference mainly comes from
>> different choices of registers(xmm13 -> xmm12).
>> Ok for trunk then?
>
> OK.
>
> Thank you for checking SPEC2017.
>
> I hope it will not create troubles for other targets.

Can we hold off for a bit?  Like Alexander says, there seem to be
some inconsistencies in the target patterns, so I think we should
first rule out any changes being needed there.

Thanks,
Richard
Hongtao Liu May 31, 2022, 11:51 p.m. UTC | #13
On Wed, Jun 1, 2022 at 12:40 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Vladimir Makarov via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > On 2022-05-29 23:05, Hongtao Liu wrote:
> >> On Fri, May 27, 2022 at 5:12 AM Vladimir Makarov via Gcc-patches
> >> <gcc-patches@gcc.gnu.org> wrote:
> >>>
> >>> On 2022-05-24 23:39, liuhongt wrote:
> >>>> Rigt now, mem_cost for separate mem alternative is 1 * frequency which
> >>>> is pretty small and caused the unnecessary SSE spill in the PR, I've tried
> >>>> to rework backend cost model, but RA still not happy with that(regress
> >>>> somewhere else). I think the root cause of this is cost for separate 'm'
> >>>> alternative cost is too small, especially considering that the mov cost
> >>>> of gpr are 2(default for REGISTER_MOVE_COST). So this patch increase mem_cost
> >>>> to 2*frequency, also increase 1 for reg_class cost when m alternative.
> >>>>
> >>>>
> >>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> >>>> Ok for trunk?
> >>> Thank you for addressing this problem. And sorry I can not approve this
> >>> patch at least w/o your additional work on benchmarking this change.
> >>>
> >>> This code is very old.  It is coming from older RA (former file
> >>> regclass.c) and existed practically since GCC day 1.  People tried many
> >>> times to improve this code.  The code also affects many targets.
> >> Yes, that's why I increased it as low as possible, so it won't regress
> >> #c6 in the PR.
> >>> I can approve this patch if you show that there is no regression at
> >>> least on x86-64 on some credible benchmark, e.g. SPEC2006 or SPEC2017.
> >>>
> >> I've tested the patch for SPEC2017 with both  -march=cascadelake
> >> -Ofast -flto and -O2 -mtune=generic.
> >> No obvious regression is observed, the binaries are all different from
> >> before, so I looked at 2 of them, the difference mainly comes from
> >> different choices of registers(xmm13 -> xmm12).
> >> Ok for trunk then?
> >
> > OK.
> >
> > Thank you for checking SPEC2017.
> >
> > I hope it will not create troubles for other targets.
>
> Can we hold off for a bit?  Like Alexander says, there seem to be
> some inconsistencies in the target patterns, so I think we should
> first rule out any changes being needed there.
Yes, i'm also testing another patch.
>
> Thanks,
> Richard
diff mbox series

Patch

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index 964c94a06ef..f7b8325e195 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -625,7 +625,8 @@  record_reg_classes (int n_alts, int n_ops, rtx *ops,
 			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
 			    {
 			      rclass = cost_classes[k];
-			      pp_costs[k] = mem_cost[rclass][0] * frequency;
+			      pp_costs[k] = (mem_cost[rclass][0]
+					     + 1) * frequency;
 			    }
 			}
 		      else
@@ -648,7 +649,8 @@  record_reg_classes (int n_alts, int n_ops, rtx *ops,
 			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
 			    {
 			      rclass = cost_classes[k];
-			      pp_costs[k] = mem_cost[rclass][1] * frequency;
+			      pp_costs[k] = (mem_cost[rclass][1]
+					     + 1) * frequency;
 			    }
 			}
 		      else
@@ -670,9 +672,9 @@  record_reg_classes (int n_alts, int n_ops, rtx *ops,
 			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
 			    {
 			      rclass = cost_classes[k];
-			      pp_costs[k] = ((mem_cost[rclass][0]
-					      + mem_cost[rclass][1])
-					     * frequency);
+			      pp_costs[k] = (mem_cost[rclass][0]
+					     + mem_cost[rclass][1]
+					     + 2) * frequency;
 			    }
 			}
 		      else
@@ -861,7 +863,8 @@  record_reg_classes (int n_alts, int n_ops, rtx *ops,
 			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
 			    {
 			      rclass = cost_classes[k];
-			      pp_costs[k] = mem_cost[rclass][0] * frequency;
+			      pp_costs[k] = (mem_cost[rclass][0]
+					     + 1) * frequency;
 			    }
 			}
 		      else
@@ -884,7 +887,8 @@  record_reg_classes (int n_alts, int n_ops, rtx *ops,
 			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
 			    {
 			      rclass = cost_classes[k];
-			      pp_costs[k] = mem_cost[rclass][1] * frequency;
+			      pp_costs[k] = (mem_cost[rclass][1]
+					     + 1) * frequency;
 			    }
 			}
 		      else
@@ -906,9 +910,9 @@  record_reg_classes (int n_alts, int n_ops, rtx *ops,
 			  for (k = cost_classes_ptr->num - 1; k >= 0; k--)
 			    {
 			      rclass = cost_classes[k];
-			      pp_costs[k] = ((mem_cost[rclass][0]
-					      + mem_cost[rclass][1])
-					     * frequency);
+			      pp_costs[k] = (mem_cost[rclass][0]
+					     + mem_cost[rclass][1]
+					     + 2) * frequency;
 			    }
 			}
 		      else
@@ -929,7 +933,7 @@  record_reg_classes (int n_alts, int n_ops, rtx *ops,
 		    /* Although we don't need insn to reload from
 		       memory, still accessing memory is usually more
 		       expensive than a register.  */
-		    pp->mem_cost = frequency;
+		    pp->mem_cost = 2 * frequency;
 		  else
 		    /* If the alternative actually allows memory, make
 		       things a bit cheaper since we won't need an
diff --git a/gcc/testsuite/gcc.target/i386/pr105513-1.c b/gcc/testsuite/gcc.target/i386/pr105513-1.c
new file mode 100644
index 00000000000..530f5292252
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105513-1.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2 -mtune=skylake -mfpmath=sse" } */
+/* { dg-final { scan-assembler-not "\\(%rsp\\)" } } */
+
+static int as_int(float x)
+{
+    return (union{float x; int i;}){x}.i;
+}
+
+float f(double y, float x)
+{
+    int i = as_int(x);
+    if (__builtin_expect(i > 99, 0)) return 0;
+    if (i*2u < 77) if (i==2) return 0;
+    return y*x;
+}