Zen tuning part 11: Fix cost model of AVX moves, unaligned moves and sse<->int moves

Message ID 20171023082643.GA41960@kam.mff.cuni.cz
State New
Series Zen tuning part 11: Fix cost model of AVX moves, unaligned moves and sse<->int moves

Commit Message

Jan Hubicka Oct. 23, 2017, 8:26 a.m. UTC
Hi,
this patch extends the processor_costs tables with unaligned move costs
(which are needed for the vectorizer cost model) and AVX move costs, and
splits sse<->integer moves into two entries, because AMD chips are very
asymmetric here (because of different pipeline lengths, I assume).

Register move cost used to return 100 for all AVX moves; now it behaves
more rationally.  I also disabled the code that increases the cost of
sse<->int moves through memory by 20 because of the memory size mismatch
stall, at least when the quantity moved fits in an integer register.

I think newer CPUs handle well the case where a value is stored by parts
but read as a whole, but I need to double check it.  We may disable some
of the mismatch logic for those, as it was made for early designs where
stores and loads were required to match in size and be aligned.
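
To illustrate, this is the kind of access pattern the stall logic
penalizes - a hypothetical sketch, not code from the patch:

/* Value stored by parts (two SImode stores), then read as a whole
   (one DImode load).  Early designs required the store and the load
   to match in size and alignment for store forwarding to work.  */
static long long
merge_halves (int lo, int hi)
{
  union { int si[2]; long long di; } u; /* assumes 32-bit int, 64-bit long long */
  u.si[0] = lo;		/* store low part */
  u.si[1] = hi;		/* store high part */
  return u.di;		/* wide load reads both stores at once */
}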

I kept the hack that increases sse<->int costs to at least 8.  I will look
into it incrementally - it is true that SSE regs do not play well with the
MODES_TIEABLE macro, but I do not think an artificial cost of 8 is a good
way around that.

I also had to go through the exercise of updating all the CPU tables.
For RA the relative costs only really matter within registers of a given mode
(i.e. it is cheaper to spill an SImode register than a DImode one), but for
vectorization we are replacing integer loads/stores by vector loads/stores,
and thus the costs need to be realistic across different units.
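
For instance, when costing a copy loop the vectorizer effectively
compares integer and SSE table entries directly, along these lines
(a sketch only; vector_copy_cheaper is a made-up helper, the table
fields are the real ones from processor_costs):

/* Replacing n_elts integer loads/stores by one vector load/store
   only pays off if the tables are comparable across units.  */
static bool
vector_copy_cheaper (const struct processor_costs *c, int n_elts, int index)
{
  int scalar = n_elts * (c->int_load[2] + c->int_store[2]);
  int vector = c->sse_load[index] + c->sse_store[index];
  return vector < scalar;
}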

I noticed that many of the other tables do not make much sense - some of
this seems to be obvious bugs forgetting that move costs are relative to
the register move cost, which is 2, so an entry needs to be latency*2 (if
we ignore throughput, as we do for now).
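
Concretely, a table entry is derived from the instruction latency like
this (a trivial sketch of the convention; e.g. a 3 cycle SSE load gives
the entry 6 used in the znver1 table below):

/* Move costs are relative to the reg-reg move cost of 2, so a table
   entry is latency * 2, ignoring throughput for now.  */
static int
move_cost_entry (int latency_cycles)
{
  return 2 * latency_cycles;
}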

I have added latencies according to Agner Fog's manual and the chip
optimization guides.  Geode costs are complete guesswork.  There are some
inconsistencies in Agner's tables, so I tried to avoid those so as not to
bias the cost model.  For unaligned moves I kept the scheme of using twice
the aligned move cost for CPUs where alignment matters, and the same cost
for modern CPUs where it doesn't seem to matter.  I suppose we can
fine-tune incrementally.
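
That is, the unaligned tables were filled roughly as follows (a sketch;
alignment_sensitive stands for a hypothetical per-CPU property, it is
not a real tuning flag):

/* Derive the unaligned load/store table from the aligned one: double
   the cost where alignment matters, reuse it where it does not.  */
static void
fill_unaligned (const int aligned[5], int unaligned[5],
		int alignment_sensitive)
{
  for (int i = 0; i < 5; i++)
    unaligned[i] = alignment_sensitive ? 2 * aligned[i] : aligned[i];
}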

For CPUs that do not support SSE/AVX I have added corresponding multiples,
which at least will make GCC behave sort-of reasonably with contradicting
-march and -mtune flags.
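
The extrapolation simply doubles the cost with each doubling of the
vector width (a sketch; compare the i386 entries {4, 8, 16, 32, 64}
for 32..512-bit moves below):

/* Extrapolated move cost for a CPU without native wide vector moves;
   log2_size is 0 for a 32-bit move up to 4 for a 512-bit one.  */
static int
extrapolated_cost (int cost_32bit, int log2_size)
{
  return cost_32bit << log2_size;
}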

I have benchmarked the patch with CPU2000 on Zen and Core.  It is SPEC
neutral, but it makes improvements on Polyhedron (and a followup patch
modeling scatter/gather improves tonto from CPU2k6).

Bootstrapped/regtested x86_64-linux.

Honza

	* i386.c (dimode_scalar_chain::compute_convert_gain): Use
	xmm_move instead of sse_move.
	(sse_store_index): New function.
	(ix86_register_move_cost): Be more sensible about the mismatch
	stall; model AVX moves correctly; distinguish between sse->integer
	and integer->sse moves.
	(ix86_builtin_vectorization_cost): Model aligned and unaligned
	moves correctly; distinguish between SSE and AVX.
	* i386.h (processor_costs): Remove sse_move; add xmm_move, ymm_move
	and zmm_move. Increase size of sse load and store tables;
	add unaligned load and store tables; add ssemmx_to_integer.
	* x86-tune-costs.h: Update all entries according to real 
	move latencies from Agner Fog's manual and chip documentation.

Patch

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 253982)
+++ config/i386/i386.c	(working copy)
@@ -1601,7 +1601,7 @@  dimode_scalar_chain::compute_convert_gai
       rtx dst = SET_DEST (def_set);
 
       if (REG_P (src) && REG_P (dst))
-	gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
+	gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
       else if (REG_P (src) && MEM_P (dst))
 	gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
       else if (MEM_P (src) && REG_P (dst))
@@ -38603,6 +38603,28 @@  ix86_can_change_mode_class (machine_mode
   return true;
 }
 
+/* Return index of MODE in the sse load/store tables.  */
+
+static inline int
+sse_store_index (machine_mode mode)
+{
+  switch (GET_MODE_SIZE (mode))
+    {
+    case 4:
+      return 0;
+    case 8:
+      return 1;
+    case 16:
+      return 2;
+    case 32:
+      return 3;
+    case 64:
+      return 4;
+    default:
+      return -1;
+    }
+}
+
 /* Return the cost of moving data of mode M between a
    register and memory.  A value of 2 is the default; this cost is
    relative to those in `REGISTER_MOVE_COST'.
@@ -38646,21 +38668,9 @@  inline_memory_move_cost (machine_mode mo
     }
   if (SSE_CLASS_P (regclass))
     {
-      int index;
-      switch (GET_MODE_SIZE (mode))
-	{
-	  case 4:
-	    index = 0;
-	    break;
-	  case 8:
-	    index = 1;
-	    break;
-	  case 16:
-	    index = 2;
-	    break;
-	  default:
-	    return 100;
-	}
+      int index = sse_store_index (mode);
+      if (index == -1)
+	return 100;
       if (in == 2)
         return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
       return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
@@ -38763,8 +38773,10 @@  ix86_register_move_cost (machine_mode mo
       /* In case of copying from general_purpose_register we may emit multiple
          stores followed by single load causing memory size mismatch stall.
          Count this as arbitrarily high cost of 20.  */
-      if (targetm.class_max_nregs (class1, mode)
-	  > targetm.class_max_nregs (class2, mode))
+      if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
+	  && TARGET_MEMORY_MISMATCH_STALL
+	  && targetm.class_max_nregs (class1, mode)
+	     > targetm.class_max_nregs (class2, mode))
 	cost += 20;
 
       /* In the case of FP/MMX moves, the registers actually overlap, and we
@@ -38786,12 +38798,19 @@  ix86_register_move_cost (machine_mode mo
        where integer modes in MMX/SSE registers are not tieable
        because of missing QImode and HImode moves to, from or between
        MMX/SSE registers.  */
-    return MAX (8, ix86_cost->mmxsse_to_integer);
+    return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
+		? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
 
   if (MAYBE_FLOAT_CLASS_P (class1))
     return ix86_cost->fp_move;
   if (MAYBE_SSE_CLASS_P (class1))
-    return ix86_cost->sse_move;
+    {
+      if (GET_MODE_BITSIZE (mode) <= 128)
+	return ix86_cost->xmm_move;
+      if (GET_MODE_BITSIZE (mode) <= 256)
+	return ix86_cost->ymm_move;
+      return ix86_cost->zmm_move;
+    }
   if (MAYBE_MMX_CLASS_P (class1))
     return ix86_cost->mmx_move;
   return 2;
@@ -44339,6 +44358,7 @@  ix86_builtin_vectorization_cost (enum ve
 {
   bool fp = false;
   machine_mode mode = TImode;
+  int index;
   if (vectype != NULL)
     {
       fp = FLOAT_TYPE_P (vectype);
@@ -44366,13 +44386,16 @@  ix86_builtin_vectorization_cost (enum ve
 			      true);
 
       case vector_load:
+	index = sse_store_index (mode);
+	gcc_assert (index >= 0);
         return ix86_vec_cost (mode,
-			      COSTS_N_INSNS (ix86_cost->sse_load[2]) / 2,
+			      COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
 			      true);
 
       case vector_store:
+	index = sse_store_index (mode);
         return ix86_vec_cost (mode,
-			      COSTS_N_INSNS (ix86_cost->sse_store[2]) / 2,
+			      COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
 			      true);
 
       case vec_to_scalar:
@@ -44383,14 +44406,18 @@  ix86_builtin_vectorization_cost (enum ve
 	 Do that incrementally.  */
       case unaligned_load:
       case vector_gather_load:
+	index = sse_store_index (mode);
         return ix86_vec_cost (mode,
-			      COSTS_N_INSNS (ix86_cost->sse_load[2]),
+			      COSTS_N_INSNS
+				 (ix86_cost->sse_unaligned_load[index]) / 2,
 			      true);
 
       case unaligned_store:
       case vector_scatter_store:
+	index = sse_store_index (mode);
         return ix86_vec_cost (mode,
-			      COSTS_N_INSNS (ix86_cost->sse_store[2]),
+			      COSTS_N_INSNS
+				 (ix86_cost->sse_unaligned_store[index]) / 2,
 			      true);
 
       case cond_branch_taken:
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 253982)
+++ config/i386/i386.h	(working copy)
@@ -242,13 +242,17 @@  struct processor_costs {
 				   in SImode and DImode */
   const int mmx_store[2];	/* cost of storing MMX register
 				   in SImode and DImode */
-  const int sse_move;		/* cost of moving SSE register.  */
-  const int sse_load[3];	/* cost of loading SSE register
-				   in SImode, DImode and TImode*/
-  const int sse_store[3];	/* cost of storing SSE register
-				   in SImode, DImode and TImode*/
+  const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
+	    zmm_move;
+  const int sse_load[5];	/* cost of loading SSE register
+				   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  const int sse_unaligned_load[5];/* cost of unaligned load.  */
+  const int sse_store[5];	/* cost of storing SSE register
+				   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  const int sse_unaligned_store[5];/* cost of unaligned store.  */
   const int mmxsse_to_integer;	/* cost of moving mmxsse register to
-				   integer and vice versa.  */
+				   integer.  */
+  const int ssemmx_to_integer;  /* cost of moving integer to mmxsse register. */
   const int l1_cache_size;	/* size of l1 cache, in kilobytes.  */
   const int l2_cache_size;	/* size of l2 cache, in kilobytes.  */
   const int prefetch_block;	/* bytes moved to cache for prefetch.  */
Index: config/i386/x86-tune-costs.h
===================================================================
--- config/i386/x86-tune-costs.h	(revision 253982)
+++ config/i386/x86-tune-costs.h	(working copy)
@@ -1,4 +1,26 @@ 
+/* Costs of operations of individual x86 CPUs.
+   Copyright (C) 1988-2017 Free Software Foundation, Inc.
 
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
 /* Processor costs (relative to an add) */
 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
 #define COSTS_N_BYTES(N) ((N) * 2)
@@ -33,6 +55,8 @@  struct processor_costs ix86_size_cost =
   COSTS_N_BYTES (3),			/* cost of movzx */
   0,					/* "large" insn */
   2,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2. */
   2,				     /* cost for loading QImode using movzbl */
   {2, 2, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -48,12 +72,16 @@  struct processor_costs ix86_size_cost =
 					   in SImode and DImode */
   {3, 3},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  3,					/* cost of moving SSE register */
-  {3, 3, 3},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {3, 3, 3},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
+  3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
+  {3, 3, 3, 3, 3},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},			/* cost of unaligned SSE load
+					   in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
+					   in 32,64,128,256 and 512-bit */
+  3, 3,					/* SSE->integer and integer->SSE moves */
   0,					/* size of l1 cache  */
   0,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -112,6 +140,9 @@  struct processor_costs i386_cost = {	/*
   COSTS_N_INSNS (2),			/* cost of movzx */
   15,					/* "large" insn */
   3,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   4,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -127,12 +158,14 @@  struct processor_costs i386_cost = {	/*
 					   in SImode and DImode */
   {4, 8},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 8, 16},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 8, 16},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  3, 3,					/* SSE->integer and integer->SSE moves */
   0,					/* size of l1 cache  */
   0,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -190,6 +223,9 @@  struct processor_costs i486_cost = {	/*
   COSTS_N_INSNS (2),			/* cost of movzx */
   15,					/* "large" insn */
   3,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   4,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -205,12 +241,14 @@  struct processor_costs i486_cost = {	/*
 					   in SImode and DImode */
   {4, 8},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 8, 16},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 8, 16},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  3, 3,					/* SSE->integer and integer->SSE moves */
   4,					/* size of l1 cache.  486 has 8kB cache
 					   shared for code and data, so 4kB is
 					   not really precise.  */
@@ -270,6 +308,9 @@  struct processor_costs pentium_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   6,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   6,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -285,12 +326,14 @@  struct processor_costs pentium_cost = {
 					   in SImode and DImode */
   {8, 8},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 8, 16},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 8, 16},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  3, 3,					/* SSE->integer and integer->SSE moves */
   8,					/* size of l1 cache.  */
   8,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -341,6 +384,9 @@  struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   6,				     /* cost for loading QImode using movzbl */
   {2, 4, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -356,12 +402,14 @@  struct processor_costs lakemont_cost = {
 					   in SImode and DImode */
   {8, 8},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 8, 16},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 8, 16},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  3, 3,					/* SSE->integer and integer->SSE moves */
   8,					/* size of l1 cache.  */
   8,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -427,6 +475,9 @@  struct processor_costs pentiumpro_cost =
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   6,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   2,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -442,12 +493,14 @@  struct processor_costs pentiumpro_cost =
 					   in SImode and DImode */
   {2, 2},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {2, 2, 8},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {2, 2, 8},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
+  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
+  3, 3,					/* SSE->integer and integer->SSE moves */
   8,					/* size of l1 cache.  */
   256,					/* size of l2 cache  */
   32,					/* size of prefetch block */
@@ -504,13 +557,16 @@  struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   4,					/* MOVE_RATIO */
-  1,				     /* cost for loading QImode using movzbl */
-  {1, 1, 1},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  2,				     /* cost for loading QImode using movzbl */
+  {2, 2, 2},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {1, 1, 1},				/* cost of storing integer registers */
-  1,					/* cost of reg,reg fld/fst */
-  {1, 1, 1},				/* cost of loading fp registers
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 2},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
   {4, 6, 6},				/* cost of storing fp registers
 					   in SFmode, DFmode and XFmode */
@@ -520,12 +576,14 @@  struct processor_costs geode_cost = {
 					   in SImode and DImode */
   {2, 2},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {2, 2, 8},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {2, 2, 8},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
+  6, 6,					/* SSE->integer and integer->SSE moves */
   64,					/* size of l1 cache.  */
   128,					/* size of l2 cache.  */
   32,					/* size of prefetch block */
@@ -582,6 +640,9 @@  struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of movzx */
   8,					/* "large" insn */
   4,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   3,				     /* cost for loading QImode using movzbl */
   {4, 5, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -597,12 +658,14 @@  struct processor_costs k6_cost = {
 					   in SImode and DImode */
   {2, 2},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {2, 2, 8},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {2, 2, 8},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  6,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
+  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
+  6, 6,					/* SSE->integer and integer->SSE moves */
   32,					/* size of l1 cache.  */
   32,					/* size of l2 cache.  Some models
 					   have integrated l2 cache, but
@@ -665,6 +728,9 @@  struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -680,12 +746,14 @@  struct processor_costs athlon_cost = {
 					   in SImode and DImode */
   {4, 4},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 4, 6},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 5},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  5,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 4, 6, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 6, 12, 24},			/* cost of unaligned loads.  */
+  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  5, 5,					/* SSE->integer and integer->SSE moves */
   64,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -750,6 +818,9 @@  struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -765,12 +836,14 @@  struct processor_costs k8_cost = {
 					   in SImode and DImode */
   {4, 4},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 3, 6},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 5},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  5,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 3, 6, 12, 24},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 3, 6, 12, 24},			/* cost of unaligned loads.  */
+  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  5, 5,					/* SSE->integer and integer->SSE moves */
   64,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -839,6 +912,9 @@  struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   4,				     /* cost for loading QImode using movzbl */
   {3, 4, 3},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -854,12 +930,14 @@  struct processor_costs amdfam10_cost = {
 					   in SImode and DImode */
   {4, 4},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 4, 3},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 5},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {4, 4, 3, 6, 12},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
+  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  3, 3,					/* SSE->integer and integer->SSE moves */
   					/* On K8:
   					    MOVD reg64, xmmreg Double FSTORE 4
 					    MOVD reg32, xmmreg Double FSTORE 4
@@ -937,35 +1015,32 @@  const struct processor_costs bdver1_cost
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-  4,				     /* cost for loading QImode using movzbl */
-  {5, 5, 4},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {4, 4, 4},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {5, 5, 12},				/* cost of loading fp registers
+  {8, 8, 8},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
 		   			   in SFmode, DFmode and XFmode */
-  {4, 4, 8},				/* cost of storing fp registers
+  {10, 10, 18},				/* cost of storing fp registers
  		   			   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 4},				/* cost of loading MMX registers
+  4,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
+  {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 4, 4},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 4},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  2,					/* MMX or SSE register to integer */
-  					/* On K8:
-					    MOVD reg64, xmmreg Double FSTORE 4
-					    MOVD reg32, xmmreg Double FSTORE 4
-					   On AMDFAM10:
-					    MOVD reg64, xmmreg Double FADD 3
-							       1/1  1/1
-					    MOVD reg32, xmmreg Double FADD 3
-							       1/1  1/1 */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
+  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  16, 20,				/* SSE->integer and integer->SSE moves */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1037,35 +1112,32 @@  const struct processor_costs bdver2_cost
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-  4,				     /* cost for loading QImode using movzbl */
-  {5, 5, 4},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {4, 4, 4},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {5, 5, 12},				/* cost of loading fp registers
+  {8, 8, 8},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
 		   			   in SFmode, DFmode and XFmode */
-  {4, 4, 8},				/* cost of storing fp registers
+  {10, 10, 18},				/* cost of storing fp registers
  		   			   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 4},				/* cost of loading MMX registers
+  4,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
+  {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 4, 4},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 4},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  2,					/* MMX or SSE register to integer */
-  					/* On K8:
-					    MOVD reg64, xmmreg Double FSTORE 4
-					    MOVD reg32, xmmreg Double FSTORE 4
-					   On AMDFAM10:
-					    MOVD reg64, xmmreg Double FADD 3
-							       1/1  1/1
-					    MOVD reg32, xmmreg Double FADD 3
-							       1/1  1/1 */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
+  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  16, 20,				/* SSE->integer and integer->SSE moves */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1136,27 +1208,32 @@  struct processor_costs bdver3_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-  4,				     /* cost for loading QImode using movzbl */
-  {5, 5, 4},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {4, 4, 4},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {5, 5, 12},				/* cost of loading fp registers
+  {8, 8, 8},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
 		   			   in SFmode, DFmode and XFmode */
-  {4, 4, 8},				/* cost of storing fp registers
+  {10, 10, 18},				/* cost of storing fp registers
  		   			   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 4},				/* cost of loading MMX registers
+  4,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
+  {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 4, 4},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 4},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  2,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
+  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  16, 20,				/* SSE->integer and integer->SSE moves */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1226,27 +1303,32 @@  struct processor_costs bdver4_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-  4,				     /* cost for loading QImode using movzbl */
-  {5, 5, 4},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {4, 4, 4},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {5, 5, 12},				/* cost of loading fp registers
+  {8, 8, 8},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 28},				/* cost of loading fp registers
 		   			   in SFmode, DFmode and XFmode */
-  {4, 4, 8},				/* cost of storing fp registers
+  {10, 10, 18},				/* cost of storing fp registers
  		   			   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {4, 4},				/* cost of loading MMX registers
+  4,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
+  {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 4, 4},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 4},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  2,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
+  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  16, 20,				/* SSE->integer and integer->SSE moves */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1321,6 +1403,9 @@  struct processor_costs znver1_cost = {
   8,					/* "large" insn.  */
   9,					/* MOVE_RATIO.  */
 
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+
   /* reg-reg moves are done by renaming and thus they are even cheaper than
     1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
      to doubles of latencies, we do not model this correctly.  It does not
@@ -1342,12 +1427,14 @@  struct processor_costs znver1_cost = {
 					   in SImode and DImode.  */
   {8, 8},				/* cost of storing MMX registers
 					   in SImode and DImode.  */
-  2,					/* cost of moving SSE register.  */
-  {6, 6, 6},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode.  */
-  {8, 8, 8},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode.  */
-  6,					/* MMX or SSE register to integer.  */
+  2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
+  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+  {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
+  {8, 8, 8, 8, 16},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit.  */
+  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  6, 6,					/* SSE->integer and integer->SSE moves.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block.  */
@@ -1426,35 +1513,32 @@  const struct processor_costs btver1_cost
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-  4,				     /* cost for loading QImode using movzbl */
-  {3, 4, 3},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {6, 8, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {3, 4, 3},				/* cost of storing integer registers */
+  {6, 8, 6},				/* cost of storing integer registers */
   4,					/* cost of reg,reg fld/fst */
-  {4, 4, 12},				/* cost of loading fp registers
+  {12, 12, 28},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
-  {6, 6, 8},				/* cost of storing fp registers
+  {12, 12, 38},				/* cost of storing fp registers
 					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {3, 3},				/* cost of loading MMX registers
+  4,					/* cost of moving MMX register */
+  {10, 10},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
+  {12, 12},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 4, 3},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 5},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
-					/* On K8:
-					   MOVD reg64, xmmreg Double FSTORE 4
-					   MOVD reg32, xmmreg Double FSTORE 4
-					   On AMDFAM10:
-					   MOVD reg64, xmmreg Double FADD 3
-							       1/1  1/1
-					    MOVD reg32, xmmreg Double FADD 3
-							       1/1  1/1 */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 24, 48},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
+  {10, 10, 12, 24, 48},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
+  14, 14,				/* SSE->integer and integer->SSE moves */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1514,35 +1598,32 @@  const struct processor_costs btver2_cost
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   9,					/* MOVE_RATIO */
-  4,				     /* cost for loading QImode using movzbl */
-  {3, 4, 3},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  8,				     /* cost for loading QImode using movzbl */
+  {8, 8, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {3, 4, 3},				/* cost of storing integer registers */
+  {8, 8, 6},				/* cost of storing integer registers */
   4,					/* cost of reg,reg fld/fst */
-  {4, 4, 12},				/* cost of loading fp registers
+  {12, 12, 28},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
-  {6, 6, 8},				/* cost of storing fp registers
+  {12, 12, 38},				/* cost of storing fp registers
 					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {3, 3},				/* cost of loading MMX registers
+  4,					/* cost of moving MMX register */
+  {10, 10},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {4, 4},				/* cost of storing MMX registers
+  {12, 12},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {4, 4, 3},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {4, 4, 5},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  3,					/* MMX or SSE register to integer */
-					/* On K8:
-					   MOVD reg64, xmmreg Double FSTORE 4
-					   MOVD reg32, xmmreg Double FSTORE 4
-					   On AMDFAM10:
-					   MOVD reg64, xmmreg Double FADD 3
-							       1/1  1/1
-					    MOVD reg32, xmmreg Double FADD 3
-							       1/1  1/1 */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {10, 10, 12, 24, 48},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
+  {10, 10, 12, 24, 48},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
+  14, 14,				/* SSE->integer and integer->SSE moves */
   32,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1601,27 +1682,32 @@  struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   16,					/* "large" insn */
   6,					/* MOVE_RATIO */
-  2,				     /* cost for loading QImode using movzbl */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  5,				     /* cost for loading QImode using movzbl */
   {4, 5, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {2, 3, 2},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {2, 2, 6},				/* cost of loading fp registers
+  12,					/* cost of reg,reg fld/fst */
+  {14, 14, 14},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
-  {4, 4, 6},				/* cost of storing fp registers
+  {14, 14, 14},				/* cost of storing fp registers
 					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {2, 2},				/* cost of loading MMX registers
+  12,					/* cost of moving MMX register */
+  {16, 16},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {2, 2},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  12,					/* cost of moving SSE register */
-  {12, 12, 12},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {2, 2, 8},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  10,					/* MMX or SSE register to integer */
+  {16, 16},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
+  {16, 16, 16, 32, 64},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
+  {16, 16, 16, 32, 64},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
+  20, 12,				/* SSE->integer and integer->SSE moves */
   8,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1683,27 +1769,32 @@  struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   16,					/* "large" insn */
   17,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   4,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
   {4, 4, 4},				/* cost of storing integer registers */
-  3,					/* cost of reg,reg fld/fst */
-  {12, 12, 12},				/* cost of loading fp registers
+  12,					/* cost of reg,reg fld/fst */
+  {14, 14, 14},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
-  {4, 4, 4},				/* cost of storing fp registers
+  {14, 14, 14},				/* cost of storing fp registers
 					   in SFmode, DFmode and XFmode */
-  6,					/* cost of moving MMX register */
+  14,					/* cost of moving MMX register */
   {12, 12},				/* cost of loading MMX registers
 					   in SImode and DImode */
   {12, 12},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  6,					/* cost of moving SSE register */
-  {12, 12, 12},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {12, 12, 12},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  8,					/* MMX or SSE register to integer */
+  6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
+  {12, 12, 12, 24, 48},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
+  {12, 12, 12, 24, 48},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
+  20, 12,				/* SSE->integer and integer->SSE moves */
   8,					/* size of l1 cache.  */
   1024,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1763,27 +1854,32 @@  struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-  4,					/* cost for loading QImode using movzbl */
-  {4, 4, 4},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  6,					/* cost for loading QImode using movzbl */
+  {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {4, 4, 4},				/* cost of storing integer registers */
+  {6, 6, 6},				/* cost of storing integer registers */
   4,					/* cost of reg,reg fld/fst */
-  {12, 12, 12},				/* cost of loading fp registers
+  {6, 6, 18},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
-  {6, 6, 8},				/* cost of storing fp registers
+  {14, 14, 24},				/* cost of storing fp registers
 					   in SFmode, DFmode and XFmode */
   2,					/* cost of moving MMX register */
   {8, 8},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {8, 8},				/* cost of storing MMX registers
+  {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {8, 8, 8},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {8, 8, 8},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  5,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
+  8, 6,					/* SSE->integer and integer->SSE moves */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1843,27 +1939,32 @@  struct processor_costs slm_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-  4,					/* cost for loading QImode using movzbl */
-  {4, 4, 4},				/* cost of loading integer registers
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
+  8,					/* cost for loading QImode using movzbl */
+  {8, 8, 8},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {4, 4, 4},				/* cost of storing integer registers */
-  4,					/* cost of reg,reg fld/fst */
-  {12, 12, 12},				/* cost of loading fp registers
+  {6, 6, 6},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 18},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
-  {6, 6, 8},				/* cost of storing fp registers
+  {6, 6, 18},				/* cost of storing fp registers
 					   in SFmode, DFmode and XFmode */
   2,					/* cost of moving MMX register */
   {8, 8},				/* cost of loading MMX registers
 					   in SImode and DImode */
-  {8, 8},				/* cost of storing MMX registers
+  {6, 6},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {8, 8, 8},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {8, 8, 8},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  5,					/* MMX or SSE register to integer */
+  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
+  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
+  8, 6,					/* SSE->integer and integer->SSE moves */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1923,6 +2024,9 @@  struct processor_costs intel_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   6,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -1938,12 +2042,14 @@  struct processor_costs intel_cost = {
 					   in SImode and DImode */
   {6, 6},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {6, 6, 6},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {6, 6, 6},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  2,					/* MMX or SSE register to integer */
+  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 6},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
+  {6, 6, 6, 6, 6},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
+  4, 4,					/* SSE->integer and integer->SSE moves */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2010,6 +2116,9 @@  struct processor_costs generic_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   4,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -2025,12 +2134,14 @@  struct processor_costs generic_cost = {
 					   in SImode and DImode */
   {6, 6},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {6, 6, 6},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {6, 6, 6},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  6,					/* MMX or SSE register to integer */
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 15, 20},			/* cost of unaligned loads.  */
+  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {10, 10, 10, 15, 20},			/* cost of unaligned stores.  */
+  20, 20,				/* SSE->integer and integer->SSE moves */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2102,6 +2213,9 @@  struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
+
+  /* All move costs are relative to integer->integer move times 2 and thus
+     they are latency*2. */
   6,				     /* cost for loading QImode using movzbl */
   {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
@@ -2117,12 +2231,14 @@  struct processor_costs core_cost = {
 					   in SImode and DImode */
   {6, 6},				/* cost of storing MMX registers
 					   in SImode and DImode */
-  2,					/* cost of moving SSE register */
-  {6, 6, 6},				/* cost of loading SSE registers
-					   in SImode, DImode and TImode */
-  {6, 6, 6},				/* cost of storing SSE registers
-					   in SImode, DImode and TImode */
-  2,					/* MMX or SSE register to integer */
+  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
+  {6, 6, 6, 6, 12},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
+  2, 2,					/* SSE->integer and integer->SSE moves */
   64,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */