diff mbox series

i386: Separate costs of pseudo registers from hard registers

Message ID CAMe9rOpWCLPbpfawhUFO+G1BJJKayUicg3r09-J4z6k5X6BbpQ@mail.gmail.com
State New
Headers show
Series i386: Separate costs of pseudo registers from hard registers | expand

Commit Message

H.J. Lu July 23, 2019, 9:57 p.m. UTC
On Mon, Jun 24, 2019 at 9:16 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jun 24, 2019 at 6:37 AM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Thu, 20 Jun 2019, Jan Hubicka wrote:
> >
> > > > > Currently, costs of moves are also used for costs of RTL expressions.   This
> > > > > patch:
> > > > >
> > > > > https://gcc.gnu.org/ml/gcc-patches/2018-02/msg00405.html
> > > > >
> > > > > includes:
> > > > >
> > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > > > > index e943d13..8409a5f 100644
> > > > > --- a/gcc/config/i386/x86-tune-costs.h
> > > > > +++ b/gcc/config/i386/x86-tune-costs.h
> > > > > @@ -1557,7 +1557,7 @@ struct processor_costs skylake_cost = {
> > > > >    {4, 4, 4}, /* cost of loading integer registers
> > > > >      in QImode, HImode and SImode.
> > > > >      Relative to reg-reg move (2).  */
> > > > > -  {6, 6, 6}, /* cost of storing integer registers */
> > > > > +  {6, 6, 3}, /* cost of storing integer registers */
> > > > >    2, /* cost of reg,reg fld/fst */
> > > > >    {6, 6, 8}, /* cost of loading fp registers
> > > > >      in SFmode, DFmode and XFmode */
> > >
> > > Well, it seems that the patch was fixing things on wrong spot - the
> > > tables are intended to be mostly latency based. I think we ought to
> > > document divergences from these including benchmarks where the change
> > > helped. Otherwise it is very hard to figure out why the entry does not
> > > match the reality.
> > > > >
> > > > > It lowered the cost for SImode store and made it cheaper than SSE<->integer
> > > > > register move.  It caused a regression:
> > > > >
> > > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90878
> > > > >
> > > > > Since the cost for SImode store is also used to compute scalar_store
> > > > > in ix86_builtin_vectorization_cost, it changed loop costs in
> > > > >
> > > > > void
> > > > > foo (long p2, long *diag, long d, long i)
> > > > > {
> > > > >   long k;
> > > > >   k = p2 < 3 ? p2 + p2 : p2 + 3;
> > > > >   while (i < k)
> > > > >     diag[i++] = d;
> > > > > }
> > > > >
> > > > > As the result, the loop is unrolled 4 times with -O3 -march=skylake,
> > > > > instead of 3.
> > > > >
> > > > > My patch separates costs of moves from costs of RTL expressions.  We have
> > > > > a follow up patch which restores the cost for SImode store back to 6 and leaves
> > > > > the cost of scalar_store unchanged.  It keeps loop unrolling unchanged and
> > > > > improves powf performance in glibc by 30%.  We are collecting SPEC CPU 2017
> > > > > data now.
> > >
> > > I have seen the problem with scalar_store with AMD tuning as well.
> > > It seems to make SLP vectorizer to be happy about idea of turning
> > > sequence of say integer stores into code which moves all the values into
> > > AVX register and then does one vector store.
> > >
> > > The cost basically compares cost of N scalar stores to 1 scalar store +
> > > vector construction. Vector construction then N*sse_op+addss.
> > >
> > > With testcase:
> > >
> > > short array[8];
> > > test (short a,short b,short c,short d,short e,short f,short g,short h)
> > > {
> > >   array[0]=a;
> > >   array[1]=b;
> > >   array[2]=c;
> > >   array[3]=d;
> > >   array[4]=e;
> > >   array[5]=f;
> > >   array[6]=g;
> > >   array[7]=h;
> > > }
> > > int iarray[8];
> > > test2 (int a,int b,int c,int d,int e,int f,int g,int h)
> > > {
> > >   iarray[0]=a;
> > >   iarray[1]=b;
> > >   iarray[2]=c;
> > >   iarray[3]=d;
> > >   iarray[4]=e;
> > >   iarray[5]=f;
> > >   iarray[6]=g;
> > >   iarray[7]=h;
> > > }
> > >
> > > I get the following codegen:
> > >
> > >
> > > test:
> > >         vmovd   %edi, %xmm0
> > >         vmovd   %edx, %xmm2
> > >         vmovd   %r8d, %xmm1
> > >         vmovd   8(%rsp), %xmm3
> > >         vpinsrw $1, 16(%rsp), %xmm3, %xmm3
> > >         vpinsrw $1, %esi, %xmm0, %xmm0
> > >         vpinsrw $1, %ecx, %xmm2, %xmm2
> > >         vpinsrw $1, %r9d, %xmm1, %xmm1
> > >         vpunpckldq      %xmm2, %xmm0, %xmm0
> > >         vpunpckldq      %xmm3, %xmm1, %xmm1
> > >         vpunpcklqdq     %xmm1, %xmm0, %xmm0
> > >         vmovaps %xmm0, array(%rip)
> > >         ret
> > >
> > > test2:
> > >         vmovd   %r8d, %xmm5
> > >         vmovd   %edx, %xmm6
> > >         vmovd   %edi, %xmm7
> > >         vpinsrd $1, %r9d, %xmm5, %xmm1
> > >         vpinsrd $1, %ecx, %xmm6, %xmm3
> > >         vpinsrd $1, %esi, %xmm7, %xmm0
> > >         vpunpcklqdq     %xmm3, %xmm0, %xmm0
> > >         vmovd   16(%rbp), %xmm4
> > >         vpinsrd $1, 24(%rbp), %xmm4, %xmm2
> > >         vpunpcklqdq     %xmm2, %xmm1, %xmm1
> > >         vinserti128     $0x1, %xmm1, %ymm0, %ymm0
> > >         vmovdqu %ymm0, iarray(%rip)
> > >         vzeroupper
> > >       ret
> > >
> > > which is about 20% slower on my skylake notebook than the
> > > non-SLP-vectorized variant.
> > >
> > > I wonder if the vec_construct costs should be made more realistic.
> > > It is computed as:
> > >
> > >       case vec_construct:
> > >         {
> > >           /* N element inserts into SSE vectors.  */
> > >           int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > >           /* One vinserti128 for combining two SSE vectors for AVX256.  */
> > >           if (GET_MODE_BITSIZE (mode) == 256)
> > >             cost += ix86_vec_cost (mode, ix86_cost->addss);
> > >           /* One vinserti64x4 and two vinserti128 for combining SSE
> > >              and AVX256 vectors to AVX512.  */
> > >           else if (GET_MODE_BITSIZE (mode) == 512)
> > >             cost += 3 * ix86_vec_cost (mode, ix86_cost->addss);
> > >           return cost;
> > >
> > > So it expects 8 simple SSE operations + one SSE FP arithmetical
> > > operations.  While code above has 8 inter-unit moves + 3 SSE integer
> > > operations to shuffle things around. Not mentioning the increased
> > > register pressure.
> >
> > But aren't the inter-unit moves a red herring?  Your testcase places
> > the sources in integer registers but usually for the case of
> > vectorization we arrive here from strided loads for which we could
> > load the first value into a %xmm reg directly and have the
> > later vpinsr instruction have memory source?
> >
> > Yes, vec_construct cost isn't the full story in this case which is
> > why add_stmt special-cases strided loads/stores adding some
> > pessimization.
> >
> > > I would say that for integer constructs it is a common case that things
> > > needs to be moved from integer unit to SSE.
> >
> > Is it?  For SLP vectorization probably yes.  The costing interface
> > unfortunately is not giving much information here (well, add_stmt
> > has access to the stmt_info ...).
> >
> > > Overall the problem is deeper since vectorizer really may need to get
> > > better idea about latencies and throughputs to estimate loop times more
> > > realistically.
> >
> > Indeed, but I hardly see how we can handle this in a sensible way since
> > we don't even understand performance corner-cases when analyzing them
> > and looking at this info but the HW still behaves in unexpected ways :/
> >
> > > One also may want to account somewhat that stores are often not part
> > > of the hot path and thus their latency is not too critical and the
> > > fact that vector stores prevents later partial memory stalls on the
> > > other hand...
> > >
>
> Costs of moves are closely related to latency and should only be used
> for register allocator.   We shouldn't use costs of moves for RTL costs.
> For register allocator, register <-> register moves are preferred over
> load and store unless it is slower than register -> memory -> register.
> For RTL costs,  we may want to make load and store cheap to improve
> RTL expansion.  But we don't want to change load and store costs for
> register allocator.   We need to separate costs of moves from costs of
> RTL expressions first.
>

Here is the updated patch to improve register allocator and RTL
expressions independently.

Any comments?

Thanks.

Comments

H.J. Lu Aug. 5, 2019, 8:55 p.m. UTC | #1
On Tue, Jul 23, 2019 at 2:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jun 24, 2019 at 9:16 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jun 24, 2019 at 6:37 AM Richard Biener <rguenther@suse.de> wrote:
> > >
> > > On Thu, 20 Jun 2019, Jan Hubicka wrote:
> > >
> > > > > > Currently, costs of moves are also used for costs of RTL expressions.   This
> > > > > > patch:
> > > > > >
> > > > > > https://gcc.gnu.org/ml/gcc-patches/2018-02/msg00405.html
> > > > > >
> > > > > > includes:
> > > > > >
> > > > > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> > > > > > index e943d13..8409a5f 100644
> > > > > > --- a/gcc/config/i386/x86-tune-costs.h
> > > > > > +++ b/gcc/config/i386/x86-tune-costs.h
> > > > > > @@ -1557,7 +1557,7 @@ struct processor_costs skylake_cost = {
> > > > > >    {4, 4, 4}, /* cost of loading integer registers
> > > > > >      in QImode, HImode and SImode.
> > > > > >      Relative to reg-reg move (2).  */
> > > > > > -  {6, 6, 6}, /* cost of storing integer registers */
> > > > > > +  {6, 6, 3}, /* cost of storing integer registers */
> > > > > >    2, /* cost of reg,reg fld/fst */
> > > > > >    {6, 6, 8}, /* cost of loading fp registers
> > > > > >      in SFmode, DFmode and XFmode */
> > > >
> > > > Well, it seems that the patch was fixing things on wrong spot - the
> > > > tables are intended to be mostly latency based. I think we ought to
> > > > document divergences from these including benchmarks where the change
> > > > helped. Otherwise it is very hard to figure out why the entry does not
> > > > match the reality.
> > > > > >
> > > > > > It lowered the cost for SImode store and made it cheaper than SSE<->integer
> > > > > > register move.  It caused a regression:
> > > > > >
> > > > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90878
> > > > > >
> > > > > > Since the cost for SImode store is also used to compute scalar_store
> > > > > > in ix86_builtin_vectorization_cost, it changed loop costs in
> > > > > >
> > > > > > void
> > > > > > foo (long p2, long *diag, long d, long i)
> > > > > > {
> > > > > >   long k;
> > > > > >   k = p2 < 3 ? p2 + p2 : p2 + 3;
> > > > > >   while (i < k)
> > > > > >     diag[i++] = d;
> > > > > > }
> > > > > >
> > > > > > As the result, the loop is unrolled 4 times with -O3 -march=skylake,
> > > > > > instead of 3.
> > > > > >
> > > > > > My patch separates costs of moves from costs of RTL expressions.  We have
> > > > > > a follow up patch which restores the cost for SImode store back to 6 and leaves
> > > > > > the cost of scalar_store unchanged.  It keeps loop unrolling unchanged and
> > > > > > improves powf performance in glibc by 30%.  We are collecting SPEC CPU 2017
> > > > > > data now.
> > > >
> > > > I have seen the problem with scalar_store with AMD tuning as well.
> > > > It seems to make SLP vectorizer to be happy about idea of turning
> > > > sequence of say integer stores into code which moves all the values into
> > > > AVX register and then does one vector store.
> > > >
> > > > The cost basically compares cost of N scalar stores to 1 scalar store +
> > > > vector construction. Vector construction then N*sse_op+addss.
> > > >
> > > > With testcase:
> > > >
> > > > short array[8];
> > > > test (short a,short b,short c,short d,short e,short f,short g,short h)
> > > > {
> > > >   array[0]=a;
> > > >   array[1]=b;
> > > >   array[2]=c;
> > > >   array[3]=d;
> > > >   array[4]=e;
> > > >   array[5]=f;
> > > >   array[6]=g;
> > > >   array[7]=h;
> > > > }
> > > > int iarray[8];
> > > > test2 (int a,int b,int c,int d,int e,int f,int g,int h)
> > > > {
> > > >   iarray[0]=a;
> > > >   iarray[1]=b;
> > > >   iarray[2]=c;
> > > >   iarray[3]=d;
> > > >   iarray[4]=e;
> > > >   iarray[5]=f;
> > > >   iarray[6]=g;
> > > >   iarray[7]=h;
> > > > }
> > > >
> > > > I get the following codegen:
> > > >
> > > >
> > > > test:
> > > >         vmovd   %edi, %xmm0
> > > >         vmovd   %edx, %xmm2
> > > >         vmovd   %r8d, %xmm1
> > > >         vmovd   8(%rsp), %xmm3
> > > >         vpinsrw $1, 16(%rsp), %xmm3, %xmm3
> > > >         vpinsrw $1, %esi, %xmm0, %xmm0
> > > >         vpinsrw $1, %ecx, %xmm2, %xmm2
> > > >         vpinsrw $1, %r9d, %xmm1, %xmm1
> > > >         vpunpckldq      %xmm2, %xmm0, %xmm0
> > > >         vpunpckldq      %xmm3, %xmm1, %xmm1
> > > >         vpunpcklqdq     %xmm1, %xmm0, %xmm0
> > > >         vmovaps %xmm0, array(%rip)
> > > >         ret
> > > >
> > > > test2:
> > > >         vmovd   %r8d, %xmm5
> > > >         vmovd   %edx, %xmm6
> > > >         vmovd   %edi, %xmm7
> > > >         vpinsrd $1, %r9d, %xmm5, %xmm1
> > > >         vpinsrd $1, %ecx, %xmm6, %xmm3
> > > >         vpinsrd $1, %esi, %xmm7, %xmm0
> > > >         vpunpcklqdq     %xmm3, %xmm0, %xmm0
> > > >         vmovd   16(%rbp), %xmm4
> > > >         vpinsrd $1, 24(%rbp), %xmm4, %xmm2
> > > >         vpunpcklqdq     %xmm2, %xmm1, %xmm1
> > > >         vinserti128     $0x1, %xmm1, %ymm0, %ymm0
> > > >         vmovdqu %ymm0, iarray(%rip)
> > > >         vzeroupper
> > > >       ret
> > > >
> > > > which is about 20% slower on my skylake notebook than the
> > > > non-SLP-vectorized variant.
> > > >
> > > > I wonder if the vec_construct costs should be made more realistic.
> > > > It is computed as:
> > > >
> > > >       case vec_construct:
> > > >         {
> > > >           /* N element inserts into SSE vectors.  */
> > > >           int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > >           /* One vinserti128 for combining two SSE vectors for AVX256.  */
> > > >           if (GET_MODE_BITSIZE (mode) == 256)
> > > >             cost += ix86_vec_cost (mode, ix86_cost->addss);
> > > >           /* One vinserti64x4 and two vinserti128 for combining SSE
> > > >              and AVX256 vectors to AVX512.  */
> > > >           else if (GET_MODE_BITSIZE (mode) == 512)
> > > >             cost += 3 * ix86_vec_cost (mode, ix86_cost->addss);
> > > >           return cost;
> > > >
> > > > So it expects 8 simple SSE operations + one SSE FP arithmetical
> > > > operations.  While code above has 8 inter-unit moves + 3 SSE integer
> > > > operations to shuffle things around. Not mentioning the increased
> > > > register pressure.
> > >
> > > But aren't the inter-unit moves a red herring?  Your testcase places
> > > the sources in integer registers but usually for the case of
> > > vectorization we arrive here from strided loads for which we could
> > > load the first value into a %xmm reg directly and have the
> > > later vpinsr instruction have memory source?
> > >
> > > Yes, vec_construct cost isn't the full story in this case which is
> > > why add_stmt special-cases strided loads/stores adding some
> > > pessimization.
> > >
> > > > I would say that for integer constructs it is a common case that things
> > > > needs to be moved from integer unit to SSE.
> > >
> > > Is it?  For SLP vectorization probably yes.  The costing interface
> > > unfortunately is not giving much information here (well, add_stmt
> > > has access to the stmt_info ...).
> > >
> > > > Overall the problem is deeper since vectorizer really may need to get
> > > > better idea about latencies and throughputs to estimate loop times more
> > > > realistically.
> > >
> > > Indeed, but I hardly see how we can handle this in a sensible way since
> > > we don't even understand performance corner-cases when analyzing them
> > > and looking at this info but the HW still behaves in unexpected ways :/
> > >
> > > > One also may want to account somewhat that stores are often not part
> > > > of the hot path and thus their latency is not too critical and the
> > > > fact that vector stores prevents later partial memory stalls on the
> > > > other hand...
> > > >
> >
> > Costs of moves are closely related to latency and should only be used
> > for register allocator.   We shouldn't use costs of moves for RTL costs.
> > For register allocator, register <-> register moves are preferred over
> > load and store unless it is slower than register -> memory -> register.
> > For RTL costs,  we may want to make load and store cheap to improve
> > RTL expansion.  But we don't want to change load and store costs for
> > register allocator.   We need to separate costs of moves from costs of
> > RTL expressions first.
> >
>
> Here is the updated patch to improve register allocator and RTL
> expressions independently.
>
> Any comments?
>

PING:

https://gcc.gnu.org/ml/gcc-patches/2019-07/msg01542.html
Jeff Law Aug. 9, 2019, 10:01 p.m. UTC | #2
On 7/23/19 3:57 PM, H.J. Lu wrote:
[ Snip ]
> Here is the updated patch to improve register allocator and RTL
> expressions independently.
> 
> Any comments?
> 
> Thanks.
> 
> 
> -- H.J.
> 
> 
> 0001-i386-Separate-costs-of-pseudo-registers-from-hard-re.patch
> 
> From 79834daf252cecfc3ee51acd864641d2cdaff733 Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Fri, 14 Jun 2019 13:30:16 -0700
> Subject: [PATCH] i386: Separate costs of pseudo registers from hard registers
> 
> processor_costs has costs of RTL expressions with pseudo registers
> and costs of hard register moves:
> 
> 1. Costs of RTL expressions are used to generate the most efficient RTL
> operations with pseudo registers.
> 
> 2. Costs of hard register moves are used by register allocator to
> decide how to allocate and move hard registers.
> 
> Since relative costs of pseudo register load and store versus pseudo
> register moves in RTL expressions can be different from relative costs
> of hard registers, we should separate costs of RTL expressions with
> pseudo registers from costs of hard registers so that register allocator
> and RTL expressions can be improved independently.
> 
> This patch moves costs of hard register moves to the new hard_register
> field and duplicates costs of moves which are also used for costs of RTL
> expressions.
> 
> 	PR target/90878
> 	* config/i386/i386.c (inline_memory_move_cost): Use hard_register
> 	for costs of hard register moves.
> 	(ix86_register_move_cost): Likewise.
> 	* config/i386/i386.h (processor_costs): Move costs of hard
> 	register moves to hard_register.  Add int_load, int_store,
> 	xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
> 	sse_load, sse_store, sse_unaligned_load and sse_unaligned_store
> 	for costs of RTL expressions.
> 	* config/i386/x86-tune-costs.h: Move costs of hard register
> 	moves to hard_register.  Duplicate int_load, int_store,
> 	xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
> 	sse_load, sse_store for costs of RTL expressions.
This looks reasonable to me.  If you haven't had objections from Jan or
Uros, go ahead and commit it.

I'm assuming this patch isn't supposed to actually change anything yet
and a subsequent patch will twiddle some of the costs, particularly for
skylake.

jeff
H.J. Lu Aug. 9, 2019, 11:13 p.m. UTC | #3
On Fri, Aug 9, 2019 at 3:01 PM Jeff Law <law@redhat.com> wrote:
>
> On 7/23/19 3:57 PM, H.J. Lu wrote:
> [ Snip ]
> > Here is the updated patch to improve register allocator and RTL
> > expressions independently.
> >
> > Any comments?
> >
> > Thanks.
> >
> >
> > -- H.J.
> >
> >
> > 0001-i386-Separate-costs-of-pseudo-registers-from-hard-re.patch
> >
> > From 79834daf252cecfc3ee51acd864641d2cdaff733 Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" <hjl.tools@gmail.com>
> > Date: Fri, 14 Jun 2019 13:30:16 -0700
> > Subject: [PATCH] i386: Separate costs of pseudo registers from hard registers
> >
> > processor_costs has costs of RTL expressions with pseudo registers
> > and costs of hard register moves:
> >
> > 1. Costs of RTL expressions are used to generate the most efficient RTL
> > operations with pseudo registers.
> >
> > 2. Costs of hard register moves are used by register allocator to
> > decide how to allocate and move hard registers.
> >
> > Since relative costs of pseudo register load and store versus pseudo
> > register moves in RTL expressions can be different from relative costs
> > of hard registers, we should separate costs of RTL expressions with
> > pseudo registers from costs of hard registers so that register allocator
> > and RTL expressions can be improved independently.
> >
> > This patch moves costs of hard register moves to the new hard_register
> > field and duplicates costs of moves which are also used for costs of RTL
> > expressions.
> >
> >       PR target/90878
> >       * config/i386/i386.c (inline_memory_move_cost): Use hard_register
> >       for costs of hard register moves.
> >       (ix86_register_move_cost): Likewise.
> >       * config/i386/i386.h (processor_costs): Move costs of hard
> >       register moves to hard_register.  Add int_load, int_store,
> >       xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
> >       sse_load, sse_store, sse_unaligned_load and sse_unaligned_store
> >       for costs of RTL expressions.
> >       * config/i386/x86-tune-costs.h: Move costs of hard register
> >       moves to hard_register.  Duplicate int_load, int_store,
> >       xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
> >       sse_load, sse_store for costs of RTL expressions.
> This looks reasonable to me.  If you haven't had objections from Jan or
> Uros, go ahead and commit it.

Will do.

> I'm assuming this patch isn't supposed to actually change anything yet
> and a subsequent patch will twiddle some of the costs, particularly for
> skylake.
>

We have a one-line followup patch to actually fix:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90878

Thanks.
diff mbox series

Patch

From 79834daf252cecfc3ee51acd864641d2cdaff733 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 14 Jun 2019 13:30:16 -0700
Subject: [PATCH] i386: Separate costs of pseudo registers from hard registers

processor_costs has costs of RTL expressions with pseudo registers
and costs of hard register moves:

1. Costs of RTL expressions are used to generate the most efficient RTL
operations with pseudo registers.

2. Costs of hard register moves are used by register allocator to
decide how to allocate and move hard registers.

Since relative costs of pseudo register load and store versus pseudo
register moves in RTL expressions can be different from relative costs
of hard registers, we should separate costs of RTL expressions with
pseudo registers from costs of hard registers so that register allocator
and RTL expressions can be improved independently.

This patch moves costs of hard register moves to the new hard_register
field and duplicates costs of moves which are also used for costs of RTL
expressions.

	PR target/90878
	* config/i386/i386.c (inline_memory_move_cost): Use hard_register
	for costs of hard register moves.
	(ix86_register_move_cost): Likewise.
	* config/i386/i386.h (processor_costs): Move costs of hard
	register moves to hard_register.  Add int_load, int_store,
	xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
	sse_load, sse_store, sse_unaligned_load and sse_unaligned_store
	for costs of RTL expressions.
	* config/i386/x86-tune-costs.h: Move costs of hard register
	moves to hard_register.  Duplicate int_load, int_store,
	xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
	sse_load, sse_store for costs of RTL expressions.
---
 gcc/config/i386/i386.c           |   59 +-
 gcc/config/i386/i386.h           |   59 +-
 gcc/config/i386/x86-tune-costs.h | 1248 ++++++++++++++++++------------
 3 files changed, 824 insertions(+), 542 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e278d9c76df..1274ad76534 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18491,8 +18491,10 @@  inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
 	    return 100;
 	}
       if (in == 2)
-        return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
-      return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
+        return MAX (ix86_cost->hard_register.fp_load [index],
+		    ix86_cost->hard_register.fp_store [index]);
+      return in ? ix86_cost->hard_register.fp_load [index]
+		: ix86_cost->hard_register.fp_store [index];
     }
   if (SSE_CLASS_P (regclass))
     {
@@ -18500,8 +18502,10 @@  inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
       if (index == -1)
 	return 100;
       if (in == 2)
-        return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
-      return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
+        return MAX (ix86_cost->hard_register.sse_load [index],
+		    ix86_cost->hard_register.sse_store [index]);
+      return in ? ix86_cost->hard_register.sse_load [index]
+		: ix86_cost->hard_register.sse_store [index];
     }
   if (MMX_CLASS_P (regclass))
     {
@@ -18518,8 +18522,10 @@  inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
 	    return 100;
 	}
       if (in == 2)
-        return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
-      return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
+        return MAX (ix86_cost->hard_register.mmx_load [index],
+		    ix86_cost->hard_register.mmx_store [index]);
+      return in ? ix86_cost->hard_register.mmx_load [index]
+		: ix86_cost->hard_register.mmx_store [index];
     }
   switch (GET_MODE_SIZE (mode))
     {
@@ -18527,37 +18533,41 @@  inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
 	if (Q_CLASS_P (regclass) || TARGET_64BIT)
 	  {
 	    if (!in)
-	      return ix86_cost->int_store[0];
+	      return ix86_cost->hard_register.int_store[0];
 	    if (TARGET_PARTIAL_REG_DEPENDENCY
 	        && optimize_function_for_speed_p (cfun))
-	      cost = ix86_cost->movzbl_load;
+	      cost = ix86_cost->hard_register.movzbl_load;
 	    else
-	      cost = ix86_cost->int_load[0];
+	      cost = ix86_cost->hard_register.int_load[0];
 	    if (in == 2)
-	      return MAX (cost, ix86_cost->int_store[0]);
+	      return MAX (cost, ix86_cost->hard_register.int_store[0]);
 	    return cost;
 	  }
 	else
 	  {
 	   if (in == 2)
-	     return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
+	     return MAX (ix86_cost->hard_register.movzbl_load,
+			 ix86_cost->hard_register.int_store[0] + 4);
 	   if (in)
-	     return ix86_cost->movzbl_load;
+	     return ix86_cost->hard_register.movzbl_load;
 	   else
-	     return ix86_cost->int_store[0] + 4;
+	     return ix86_cost->hard_register.int_store[0] + 4;
 	  }
 	break;
       case 2:
 	if (in == 2)
-	  return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
-	return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
+	  return MAX (ix86_cost->hard_register.int_load[1],
+		      ix86_cost->hard_register.int_store[1]);
+	return in ? ix86_cost->hard_register.int_load[1]
+		  : ix86_cost->hard_register.int_store[1];
       default:
 	if (in == 2)
-	  cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]);
+	  cost = MAX (ix86_cost->hard_register.int_load[2],
+		      ix86_cost->hard_register.int_store[2]);
 	else if (in)
-	  cost = ix86_cost->int_load[2];
+	  cost = ix86_cost->hard_register.int_load[2];
 	else
-	  cost = ix86_cost->int_store[2];
+	  cost = ix86_cost->hard_register.int_store[2];
 	/* Multiply with the number of GPR moves needed.  */
 	return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
     }
@@ -18627,20 +18637,21 @@  ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
        because of missing QImode and HImode moves to, from or between
        MMX/SSE registers.  */
     return MAX (8, SSE_CLASS_P (class1)
-		? ix86_cost->sse_to_integer : ix86_cost->integer_to_sse);
+		? ix86_cost->hard_register.sse_to_integer
+		: ix86_cost->hard_register.integer_to_sse);
 
   if (MAYBE_FLOAT_CLASS_P (class1))
-    return ix86_cost->fp_move;
+    return ix86_cost->hard_register.fp_move;
   if (MAYBE_SSE_CLASS_P (class1))
     {
       if (GET_MODE_BITSIZE (mode) <= 128)
-	return ix86_cost->xmm_move;
+	return ix86_cost->hard_register.xmm_move;
       if (GET_MODE_BITSIZE (mode) <= 256)
-	return ix86_cost->ymm_move;
-      return ix86_cost->zmm_move;
+	return ix86_cost->hard_register.ymm_move;
+      return ix86_cost->hard_register.zmm_move;
     }
   if (MAYBE_MMX_CLASS_P (class1))
-    return ix86_cost->mmx_move;
+    return ix86_cost->hard_register.mmx_move;
   return 2;
 }
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index a2fcdd4c644..c2b2f49cd52 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -237,9 +237,46 @@  struct stringop_algs
   } size [MAX_STRINGOP_ALGS];
 };
 
-/* Define the specific costs for a given cpu */
+/* Define the specific costs for a given cpu.  NB: hard_register is used
+   by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute
+   hard register move costs by register allocator.  Relative costs of
+   pseudo register load and store versus pseudo register moves in RTL
+   expressions for TARGET_RTX_COSTS can be different from relative
+   costs of hard registers to get the most efficient operations with
+   pseudo registers.  */
 
 struct processor_costs {
+  /* Costs used by register allocator.  integer->integer register move
+     cost is 2.  */
+  struct
+    {
+      const int movzbl_load;	/* cost of loading using movzbl */
+      const int int_load[3];	/* cost of loading integer registers
+				   in QImode, HImode and SImode relative
+				   to reg-reg move (2).  */
+      const int int_store[3];	/* cost of storing integer register
+				   in QImode, HImode and SImode */
+      const int fp_move;	/* cost of reg,reg fld/fst */
+      const int fp_load[3];	/* cost of loading FP register
+				   in SFmode, DFmode and XFmode */
+      const int fp_store[3];	/* cost of storing FP register
+				   in SFmode, DFmode and XFmode */
+      const int mmx_move;	/* cost of moving MMX register.  */
+      const int mmx_load[2];	/* cost of loading MMX register
+				   in SImode and DImode */
+      const int mmx_store[2];	/* cost of storing MMX register
+				   in SImode and DImode */
+      const int xmm_move;	/* cost of moving XMM register.  */
+      const int ymm_move;	/* cost of moving YMM register.  */
+      const int zmm_move;	/* cost of moving ZMM register.  */
+      const int sse_load[5];	/* cost of loading SSE register
+				   in 32bit, 64bit, 128bit, 256bit and 512bit */
+      const int sse_store[5];	/* cost of storing SSE register
+				   in SImode, DImode and TImode.  */
+      const int sse_to_integer;	/* cost of moving SSE register to integer.  */
+      const int integer_to_sse;	/* cost of moving integer register to SSE. */
+    } hard_register;
+
   const int add;		/* cost of an add instruction */
   const int lea;		/* cost of a lea instruction */
   const int shift_var;		/* variable shift costs */
@@ -254,32 +291,20 @@  struct processor_costs {
   const int large_insn;		/* insns larger than this cost more */
   const int move_ratio;		/* The threshold of number of scalar
 				   memory-to-memory move insns.  */
-  const int movzbl_load;	/* cost of loading using movzbl */
   const int int_load[3];	/* cost of loading integer registers
 				   in QImode, HImode and SImode relative
 				   to reg-reg move (2).  */
   const int int_store[3];	/* cost of storing integer register
 				   in QImode, HImode and SImode */
-  const int fp_move;		/* cost of reg,reg fld/fst */
-  const int fp_load[3];		/* cost of loading FP register
-				   in SFmode, DFmode and XFmode */
-  const int fp_store[3];	/* cost of storing FP register
-				   in SFmode, DFmode and XFmode */
-  const int mmx_move;		/* cost of moving MMX register.  */
-  const int mmx_load[2];	/* cost of loading MMX register
-				   in SImode and DImode */
-  const int mmx_store[2];	/* cost of storing MMX register
-				   in SImode and DImode */
-  const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
-	    zmm_move;
   const int sse_load[5];	/* cost of loading SSE register
 				   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  const int sse_unaligned_load[5];/* cost of unaligned load.  */
   const int sse_store[5];	/* cost of storing SSE register
-				   in SImode, DImode and TImode.  */
+				   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  const int sse_unaligned_load[5];/* cost of unaligned load.  */
   const int sse_unaligned_store[5];/* cost of unaligned store.  */
+  const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
+	    zmm_move;
   const int sse_to_integer;	/* cost of moving SSE register to integer.  */
-  const int integer_to_sse;	/* cost of moving integer register to SSE. */
   const int gather_static, gather_per_elt; /* Cost of gather load is computed
 				   as static + per_item * nelts. */
   const int scatter_static, scatter_per_elt; /* Cost of gather store is
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 8b963c07051..ad9ea4bfa08 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -36,6 +36,30 @@  static stringop_algs ix86_size_memset[2] = {
 
 const
 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,				     /* cost for loading QImode using movzbl */
+  {2, 2, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 2},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {2, 2, 2},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  3,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {3, 3},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
+  {3, 3, 3, 3, 3},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_BYTES (2),			/* cost of an add instruction */
   COSTS_N_BYTES (3),			/* cost of a lea instruction */
   COSTS_N_BYTES (2),			/* variable shift costs */
@@ -55,33 +79,20 @@  struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (3),			/* cost of movzx */
   0,					/* "large" insn */
   2,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2. */
-  2,				     /* cost for loading QImode using movzbl */
   {2, 2, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 2, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 2},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {2, 2, 2},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  3,					/* cost of moving MMX register */
-  {3, 3},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {3, 3},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
-  {3, 3, 3, 3, 3},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {3, 3, 3, 3, 3},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {3, 3, 3, 3, 3},			/* cost of unaligned SSE load
 					   in 128bit, 256bit and 512bit */
-  {3, 3, 3, 3, 3},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {3, 3, 3, 3, 3},				/* cost of unaligned SSE store
+  {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
 					   in 128bit, 256bit and 512bit */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   5, 0,					/* Gather load static, per_elt.  */
   5, 0,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
@@ -127,6 +138,30 @@  static stringop_algs i386_memset[2] = {
 
 static const
 struct processor_costs i386_cost = {	/* 386 specific costs */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {8, 8, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (3),			/* variable shift costs */
@@ -146,32 +181,18 @@  struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (2),			/* cost of movzx */
   15,					/* "large" insn */
   3,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 4, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {8, 8, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {8, 8, 8},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {4, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
@@ -216,6 +237,30 @@  static stringop_algs i486_memset[2] = {
 
 static const
 struct processor_costs i486_cost = {	/* 486 specific costs */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {8, 8, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (3),			/* variable shift costs */
@@ -235,32 +280,18 @@  struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (2),			/* cost of movzx */
   15,					/* "large" insn */
   3,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 4, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {8, 8, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {8, 8, 8},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {4, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   4,					/* size of l1 cache.  486 has 8kB cache
@@ -307,6 +338,30 @@  static stringop_algs pentium_memset[2] = {
 
 static const
 struct processor_costs pentium_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  8,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (4),			/* variable shift costs */
@@ -326,32 +381,18 @@  struct processor_costs pentium_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   6,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 4, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 6},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 4, 6},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  8,					/* cost of moving MMX register */
-  {8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -389,6 +430,30 @@  struct processor_costs pentium_cost = {
 
 static const
 struct processor_costs lakemont_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  8,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -408,32 +473,18 @@  struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 4, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 6},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 4, 6},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  8,					/* cost of moving MMX register */
-  {8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -486,6 +537,30 @@  static stringop_algs pentiumpro_memset[2] = {
   DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs pentiumpro_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -505,32 +580,18 @@  struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   6,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  2,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 2, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 6},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 4, 6},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
-  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
-  3, 3,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -574,6 +635,30 @@  static stringop_algs geode_memset[2] = {
   DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs geode_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  2,				     /* cost for loading QImode using movzbl */
+  {2, 2, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 2},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 6, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  6, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (2),			/* variable shift costs */
@@ -593,33 +678,18 @@  struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   4,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  2,				     /* cost for loading QImode using movzbl */
   {2, 2, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 2, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 2},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 6, 6},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-
-  2,					/* cost of moving MMX register */
-  {2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
-  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
-  6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
   2, 2,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -663,6 +733,30 @@  static stringop_algs k6_memset[2] = {
   DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs k6_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  3,				     /* cost for loading QImode using movzbl */
+  {4, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 3, 2},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {6, 6, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 4},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  6, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -682,32 +776,18 @@  struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   4,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  3,				     /* cost for loading QImode using movzbl */
   {4, 5, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 3, 2},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {6, 6, 6},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {4, 4, 4},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {2, 2},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
-  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
-  6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  6,					/* cost of moving SSE register to integer.  */
   2, 2,					/* Gather load static, per_elt.  */
   2, 2,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -757,6 +837,30 @@  static stringop_algs athlon_memset[2] = {
   DUMMY_STRINGOP_ALGS};
 static const
 struct processor_costs athlon_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 4},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 4, 12, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  5, 5,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -776,32 +880,18 @@  struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {3, 4, 3},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {4, 4, 12},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 8},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 4},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 4, 12, 12, 24},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 4, 12, 12, 24},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
-  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
-  5, 5,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -853,6 +943,30 @@  static stringop_algs k8_memset[2] = {
              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs k8_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 3, 12, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  5, 5,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -872,32 +986,18 @@  struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {3, 4, 3},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {4, 4, 12},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 8},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {3, 3},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 3, 12, 12, 24},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {4, 3, 12, 12, 24},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
-  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
-  5, 5,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  5,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -953,28 +1053,7 @@  static stringop_algs amdfam10_memset[2] = {
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
 struct processor_costs amdfam10_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (2),			/* cost of a lea instruction */
-  COSTS_N_INSNS (1),			/* variable shift costs */
-  COSTS_N_INSNS (1),			/* constant shift costs */
-  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),			/*				 HI */
-   COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (5)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35),			/*			    HI */
-   COSTS_N_INSNS (51),			/*			    SI */
-   COSTS_N_INSNS (83),			/*			    DI */
-   COSTS_N_INSNS (83)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  8,					/* "large" insn */
-  9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
   4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -993,11 +1072,10 @@  struct processor_costs amdfam10_cost = {
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
   {4, 4, 3, 6, 12},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+
   					/* On K8:
   					    MOVD reg64, xmmreg Double FSTORE 4
 					    MOVD reg32, xmmreg Double FSTORE 4
@@ -1006,6 +1084,39 @@  struct processor_costs amdfam10_cost = {
 							       1/1  1/1
 					    MOVD reg32, xmmreg Double FADD 3
 							       1/1  1/1 */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (2),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (5)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  {4, 4, 3, 6, 12},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 5, 10, 20},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
+  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  3,					/* cost of moving SSE register to integer.  */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -1062,6 +1173,30 @@  static stringop_algs bdver_memset[2] = {
              {-1, libcall, false}}}};
 
 const struct processor_costs bdver_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {8, 8, 8},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+  {10, 10, 18},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode */
+  4,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {10, 10},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  16, 20,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1081,32 +1216,18 @@  const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
   {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {8, 8, 8},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-		   			   in SFmode, DFmode and XFmode */
-  {10, 10, 18},				/* cost of storing fp registers
- 		   			   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {12, 12, 10, 40, 60},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
-  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
-  16, 20,				/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  16,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
@@ -1164,6 +1285,37 @@  static stringop_algs znver1_memset[2] = {
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 	     {-1, libcall, false}}}};
 struct processor_costs znver1_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+
+  /* reg-reg moves are done by renaming and thus they are even cheaper than
+     1 cycle. Because reg-reg move cost is 2 and the following tables correspond
+     to doubles of latencies, we do not model this correctly.  It does not
+     seem to make practical difference to bump prices up even more.  */
+  6,					/* cost for loading QImode using
+					   movzbl.  */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {8, 8, 8},				/* cost of storing integer
+					   registers.  */
+  2,					/* cost of reg,reg fld/fst.  */
+  {6, 6, 16},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode.  */
+  {8, 8, 16},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode.  */
+  2,					/* cost of moving MMX register.  */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode.  */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode.  */
+  2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
+  {6, 6, 6, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+  6, 6,					/* SSE->integer and integer->SSE moves.  */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
   COSTS_N_INSNS (1),			/* variable shift costs.  */
@@ -1186,39 +1338,19 @@  struct processor_costs znver1_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx.  */
   8,					/* "large" insn.  */
   9,					/* MOVE_RATIO.  */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-
-  /* reg-reg moves are done by renaming and thus they are even cheaper than
-     1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
-     to doubles of latencies, we do not model this correctly.  It does not
-     seem to make practical difference to bump prices up even more.  */
-  6,					/* cost for loading QImode using
-					   movzbl.  */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {8, 8, 8},				/* cost of storing integer
 					   registers.  */
-  2,					/* cost of reg,reg fld/fst.  */
-  {6, 6, 16},				/* cost of loading fp registers
-		   			   in SFmode, DFmode and XFmode.  */
-  {8, 8, 16},				/* cost of storing fp registers
- 		   			   in SFmode, DFmode and XFmode.  */
-  2,					/* cost of moving MMX register.  */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode.  */
-  {8, 8},				/* cost of storing MMX registers
-					   in SImode and DImode.  */
-  2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
-  {6, 6, 6, 12, 24},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit.  */
+  {6, 6, 6, 12, 24},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit.  */
   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
-  6, 6,					/* SSE->integer and integer->SSE moves.  */
+  2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
+  6,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
      throughput 12.  Approx 9 uops do not depend on vector size and every load
      is 7 uops.  */
@@ -1288,31 +1420,7 @@  static stringop_algs znver2_memset[2] = {
 	     {-1, libcall, false}}}};
 
 struct processor_costs znver2_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction.  */
-  COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
-  COSTS_N_INSNS (1),			/* variable shift costs.  */
-  COSTS_N_INSNS (1),			/* constant shift costs.  */
-  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
-   COSTS_N_INSNS (3),			/* 				 HI.  */
-   COSTS_N_INSNS (3),			/*				 SI.  */
-   COSTS_N_INSNS (3),			/*				 DI.  */
-   COSTS_N_INSNS (3)},			/*			other.  */
-  0,					/* cost of multiply per each bit
-					   set.  */
-   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
-      bound.  */
-  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
-   COSTS_N_INSNS (22),			/* 			    HI.  */
-   COSTS_N_INSNS (30),			/*			    SI.  */
-   COSTS_N_INSNS (45),			/*			    DI.  */
-   COSTS_N_INSNS (45)},			/*			    other.  */
-  COSTS_N_INSNS (1),			/* cost of movsx.  */
-  COSTS_N_INSNS (1),			/* cost of movzx.  */
-  8,					/* "large" insn.  */
-  9,					/* MOVE_RATIO.  */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2.  */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
 
   /* reg-reg moves are done by renaming and thus they are even cheaper than
      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
@@ -1339,12 +1447,48 @@  struct processor_costs znver2_cost = {
 					   register.  */
   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE
 					   moves.  */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),			/* cost of an add instruction.  */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
+  COSTS_N_INSNS (1),			/* variable shift costs.  */
+  COSTS_N_INSNS (1),			/* constant shift costs.  */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
+   COSTS_N_INSNS (3),			/* 				 HI.  */
+   COSTS_N_INSNS (3),			/*				 SI.  */
+   COSTS_N_INSNS (3),			/*				 DI.  */
+   COSTS_N_INSNS (3)},			/*			other.  */
+  0,					/* cost of multiply per each bit
+					   set.  */
+   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
+      bound.  */
+  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (22),			/* 			    HI.  */
+   COSTS_N_INSNS (30),			/*			    SI.  */
+   COSTS_N_INSNS (45),			/*			    DI.  */
+   COSTS_N_INSNS (45)},			/*			    other.  */
+  COSTS_N_INSNS (1),			/* cost of movsx.  */
+  COSTS_N_INSNS (1),			/* cost of movzx.  */
+  8,					/* "large" insn.  */
+  9,					/* MOVE_RATIO.  */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {8, 8, 8},				/* cost of storing integer
+					   registers.  */
+  {6, 6, 6, 6, 12},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 8, 16},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
+  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  2, 2, 3,				/* cost of moving XMM,YMM,ZMM
+					   register.  */
+  6,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
      throughput 12.  Approx 9 uops do not depend on vector size and every load
      is 7 uops.  */
@@ -1416,6 +1560,30 @@  static stringop_algs skylake_memset[2] = {
 
 static const
 struct processor_costs skylake_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 3},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 12, 24},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  2, 2,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1437,30 +1605,18 @@  struct processor_costs skylake_cost = {
   COSTS_N_INSNS (0),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  6,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 3},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {6, 6, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 10},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 20},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 12, 24},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
-  {8, 8, 8, 12, 24},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
-  2, 2,					/* SSE->integer and integer->SSE moves */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  2,					/* cost of moving SSE register to integer.  */
   20, 8,				/* Gather load static, per_elt.  */
   22, 10,				/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
@@ -1509,6 +1665,30 @@  static stringop_algs btver1_memset[2] = {
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
 const struct processor_costs btver1_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {6, 8, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 8, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {12, 12, 38},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  4,					/* cost of moving MMX register */
+  {10, 10},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  14, 14,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1528,32 +1708,18 @@  const struct processor_costs btver1_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
   {6, 8, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 8, 6},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {12, 12, 38},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {10, 10},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
-  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
-  14, 14,				/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -1600,6 +1766,30 @@  static stringop_algs btver2_memset[2] = {
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
 const struct processor_costs btver2_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {8, 8, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {12, 12, 38},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  4,					/* cost of moving MMX register */
+  {10, 10},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  14, 14,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1619,32 +1809,18 @@  const struct processor_costs btver2_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,				     /* cost for loading QImode using movzbl */
   {8, 8, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {8, 8, 6},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 28},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {12, 12, 38},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  4,					/* cost of moving MMX register */
-  {10, 10},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 48, 96},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
-  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
-  14, 14,				/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  14,					/* cost of moving SSE register to integer.  */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -1690,28 +1866,7 @@  static stringop_algs pentium4_memset[2] = {
 
 static const
 struct processor_costs pentium4_cost = {
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (3),			/* cost of a lea instruction */
-  COSTS_N_INSNS (4),			/* variable shift costs */
-  COSTS_N_INSNS (4),			/* constant shift costs */
-  {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (15),			/*				 HI */
-   COSTS_N_INSNS (15),			/*				 SI */
-   COSTS_N_INSNS (15),			/*				 DI */
-   COSTS_N_INSNS (15)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (56),			/*			    HI */
-   COSTS_N_INSNS (56),			/*			    SI */
-   COSTS_N_INSNS (56),			/*			    DI */
-   COSTS_N_INSNS (56)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  16,					/* "large" insn */
-  6,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
   5,				     /* cost for loading QImode using movzbl */
   {4, 5, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -1730,11 +1885,42 @@  struct processor_costs pentium4_cost = {
   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
   {16, 16, 16, 32, 64},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  20, 12,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (3),			/* cost of a lea instruction */
+  COSTS_N_INSNS (4),			/* variable shift costs */
+  COSTS_N_INSNS (4),			/* constant shift costs */
+  {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (15),			/*				 HI */
+   COSTS_N_INSNS (15),			/*				 SI */
+   COSTS_N_INSNS (15),			/*				 DI */
+   COSTS_N_INSNS (15)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (56),			/*			    HI */
+   COSTS_N_INSNS (56),			/*			    SI */
+   COSTS_N_INSNS (56),			/*			    DI */
+   COSTS_N_INSNS (56)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  16,					/* "large" insn */
+  6,					/* MOVE_RATIO */
+  {4, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 3, 2},				/* cost of storing integer registers */
+  {16, 16, 16, 32, 64},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {16, 16, 16, 32, 64},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
-  {16, 16, 16, 32, 64},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
-  20, 12,				/* SSE->integer and integer->SSE moves */
+  12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
+  20,					/* cost of moving SSE register to integer.  */
   16, 16,				/* Gather load static, per_elt.  */
   16, 16,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -1783,6 +1969,30 @@  static stringop_algs nocona_memset[2] = {
 
 static const
 struct processor_costs nocona_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  4,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  12,					/* cost of reg,reg fld/fst */
+  {14, 14, 14},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {14, 14, 14},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  14,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 12, 24, 48},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  20, 12,				/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1802,32 +2012,18 @@  struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   16,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  4,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {4, 4, 4},				/* cost of storing integer registers */
-  12,					/* cost of reg,reg fld/fst */
-  {14, 14, 14},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {14, 14, 14},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  14,					/* cost of moving MMX register */
-  {12, 12},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {12, 12},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 12, 24, 48},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {12, 12, 12, 24, 48},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {12, 12, 12, 24, 48},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
-  {12, 12, 12, 24, 48},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
-  20, 12,				/* SSE->integer and integer->SSE moves */
+  6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
+  20,					/* cost of moving SSE register to integer.  */
   12, 12,				/* Gather load static, per_elt.  */
   12, 12,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
@@ -1874,6 +2070,30 @@  static stringop_algs atom_memset[2] = {
              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs atom_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,					/* cost for loading QImode using movzbl */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {6, 6, 18},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {14, 14, 24},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {10, 10},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  8, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1893,32 +2113,18 @@  struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,					/* cost for loading QImode using movzbl */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {6, 6, 18},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {14, 14, 24},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {10, 10},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
-  8, 6,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
   8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -1965,6 +2171,30 @@  static stringop_algs slm_memset[2] = {
              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs slm_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  8,					/* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 18},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 18},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  8, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -1984,32 +2214,18 @@  struct processor_costs slm_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,					/* cost for loading QImode using movzbl */
   {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {8, 8, 18},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 18},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {8, 8},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
-  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
-  8, 6,					/* SSE->integer and integer->SSE moves */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  8,					/* cost of moving SSE register to integer.  */
   8, 8,					/* Gather load static, per_elt.  */
   8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2056,6 +2272,30 @@  static stringop_algs intel_memset[2] = {
              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 static const
 struct processor_costs intel_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 6},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  4, 4,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -2075,32 +2315,18 @@  struct processor_costs intel_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {6, 6, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 10},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 6},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 6},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 6},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-  {6, 6, 6, 6, 6},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
-  4, 4,					/* SSE->integer and integer->SSE moves */
+  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
+  4,					/* cost of moving SSE register to integer.  */
   6, 6,					/* Gather load static, per_elt.  */
   6, 6,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2151,6 +2377,30 @@  static stringop_algs generic_memset[2] = {
              {-1, libcall, false}}}};
 static const
 struct processor_costs generic_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {6, 6, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 12},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  6, 6,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* Setting cost to 2 makes our current implementation of synth_mult result in
      use of unnecessary temporary registers causing regression on several
@@ -2173,32 +2423,18 @@  struct processor_costs generic_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {6, 6, 12},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 12},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 10, 15},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
-  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
-  6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
+  6,					/* cost of moving SSE register to integer.  */
   18, 6,				/* Gather load static, per_elt.  */
   18, 6,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
@@ -2251,6 +2487,30 @@  static stringop_algs core_memset[2] = {
 
 static const
 struct processor_costs core_cost = {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {6, 6, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 10},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  2, 2,					/* SSE->integer and integer->SSE moves */
+  /* End of register allocator costs.  */
+
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
      this cost however our current implementation of synth_mult results in
@@ -2277,32 +2537,18 @@  struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  6,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {6, 6, 6},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {6, 6, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 10},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
-  {6, 6, 6, 6, 12},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
-  2, 2,					/* SSE->integer and integer->SSE moves */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  2,					/* cost of moving SSE register to integer.  */
   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
      rec. throughput 6.
      So 5 uops statically and one uops per load.  */
-- 
2.20.1