diff mbox series

Zen tuning part 8: Fix rtx costs.

Message ID 20171013204710.GB8466@kam.mff.cuni.cz
State New
Headers show
Series Zen tuning part 8: Fix rtx costs. | expand

Commit Message

Jan Hubicka Oct. 13, 2017, 8:47 p.m. UTC
Hi,
this patch fixes the costs of basic operations for Zen.  It also models SSE more
carefully.  ix86_rtx_costs is still based on the x87 costs of operations, which is
not very realistic today, when x87 and SSE costs are often quite different.

The latencies in this patch are based on Agner Fog's values and I hope they
match reality for all supported CPUs.  Costs of vector operations are still
off (as is the vectorizer cost model); I will look into that incrementally.

Bootstrapped/regtested x86_64-linux; I will commit it tomorrow after the periodic
testers pick up today's changes.

Bootstrapped/regtested x86_64-linux.
Honza

	* i386.c (ix86_rtx_costs): Make difference between x87 and SSE
	operations.
	* i386.h (struct processor_costs): Add addss, mulss, mulsd, divss,
	divsd, sqrtss and sqrtsd.
	* x86-tune-costs.h: Add new entries to all costs.
	(znver1_cost): Fix to match real instruction latencies.
diff mbox series

Patch

Index: i386.c
===================================================================
--- i386.c	(revision 253694)
+++ i386.c	(working copy)
@@ -38812,6 +38812,9 @@  ix86_rtx_costs (rtx x, machine_mode mode
   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
   const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
   int src_cost;
+  machine_mode inner_mode = mode;
+  if (VECTOR_MODE_P (mode))
+    inner_mode = GET_MODE_INNER (mode);
 
   switch (code)
     {
@@ -39012,7 +39015,7 @@  ix86_rtx_costs (rtx x, machine_mode mode
 
         /* ??? SSE scalar/vector cost should be used here.  */
         /* ??? Bald assumption that fma has the same cost as fmul.  */
-        *total = cost->fmul;
+        *total = mode == SFmode ? cost->mulss : cost->mulsd;
 	*total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
 
         /* Negate in op0 or op2 is free: FMS, FNMA, FNMS.  */
@@ -39031,8 +39034,7 @@  ix86_rtx_costs (rtx x, machine_mode mode
     case MULT:
       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
 	{
-	  /* ??? SSE scalar cost should be used here.  */
-	  *total = cost->fmul;
+	  *total = inner_mode == DFmode ? cost->mulsd : cost->mulss;
 	  return false;
 	}
       else if (X87_FLOAT_MODE_P (mode))
@@ -39043,7 +39045,7 @@  ix86_rtx_costs (rtx x, machine_mode mode
       else if (FLOAT_MODE_P (mode))
 	{
 	  /* ??? SSE vector cost should be used here.  */
-	  *total = cost->fmul;
+	  *total = inner_mode == DFmode ? cost->mulsd : cost->mulss;
 	  return false;
 	}
       else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
@@ -39071,7 +39073,7 @@  ix86_rtx_costs (rtx x, machine_mode mode
 	  else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
 	    *total = cost->fmul * 2 + cost->fabs * 5;
 	  else
-	    *total = cost->fmul;
+	    *total = inner_mode == DFmode ? cost->mulsd : cost->mulss;
 	  return false;
 	}
       else
@@ -39125,13 +39127,12 @@  ix86_rtx_costs (rtx x, machine_mode mode
     case MOD:
     case UMOD:
       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
-	/* ??? SSE cost should be used here.  */
-	*total = cost->fdiv;
+	*total = inner_mode == DFmode ? cost->divsd : cost->divss;
       else if (X87_FLOAT_MODE_P (mode))
 	*total = cost->fdiv;
       else if (FLOAT_MODE_P (mode))
 	/* ??? SSE vector cost should be used here.  */
-	*total = cost->fdiv;
+	*total = inner_mode == DFmode ? cost->divsd : cost->divss;
       else
 	*total = cost->divide[MODE_INDEX (mode)];
       return false;
@@ -39210,8 +39211,7 @@  ix86_rtx_costs (rtx x, machine_mode mode
 
       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
 	{
-	  /* ??? SSE cost should be used here.  */
-	  *total = cost->fadd;
+	  *total = cost->addss;
 	  return false;
 	}
       else if (X87_FLOAT_MODE_P (mode))
@@ -39221,8 +39221,8 @@  ix86_rtx_costs (rtx x, machine_mode mode
 	}
       else if (FLOAT_MODE_P (mode))
 	{
-	  /* ??? SSE vector cost should be used here.  */
-	  *total = cost->fadd;
+	  /* We should account if registers are split.  */
+	  *total = cost->addss;
 	  return false;
 	}
       /* FALLTHRU */
@@ -39317,13 +39317,12 @@  ix86_rtx_costs (rtx x, machine_mode mode
 
     case SQRT:
       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
-	/* ??? SSE cost should be used here.  */
-	*total = cost->fsqrt;
+	*total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
       else if (X87_FLOAT_MODE_P (mode))
 	*total = cost->fsqrt;
       else if (FLOAT_MODE_P (mode))
 	/* ??? SSE vector cost should be used here.  */
-	*total = cost->fsqrt;
+	*total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
       return false;
 
     case UNSPEC:
Index: i386.h
===================================================================
--- i386.h	(revision 253694)
+++ i386.h	(working copy)
@@ -257,6 +257,13 @@  struct processor_costs {
   const int fsqrt;		/* cost of FSQRT instruction.  */
 				/* Specify what algorithm
 				   to use for stringops on unknown size.  */
+  const int addss;		/* cost of ADDSS/SD SUBSS/SD instructions.  */
+  const int mulss;		/* cost of MULSS instructions.  */
+  const int mulsd;		/* cost of MULSD instructions.  */
+  const int divss;		/* cost of DIVSS instructions.  */
+  const int divsd;		/* cost of DIVSD instructions.  */
+  const int sqrtss;		/* cost of SQRTSS instructions.  */
+  const int sqrtsd;		/* cost of SQRTSD instructions.  */
   const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp;
 				/* Specify reassociation width for integer,
 				   fp, vector integer and vector fp
Index: x86-tune-costs.h
===================================================================
--- x86-tune-costs.h	(revision 253694)
+++ x86-tune-costs.h	(working copy)
@@ -65,6 +65,14 @@  struct processor_costs ix86_size_cost =
   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
+  COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
+  COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
+  COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
+  COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
+  COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   ix86_size_memcpy,
   ix86_size_memset,
@@ -142,6 +150,14 @@  struct processor_costs i386_cost = {	/*
   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (23),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (27),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (27),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (88),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i386_memcpy,
   i386_memset,
@@ -220,6 +236,14 @@  struct processor_costs i486_cost = {	/*
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (16),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (16),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (73),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (74),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i486_memcpy,
   i486_memset,
@@ -296,6 +320,14 @@  struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
@@ -365,6 +397,14 @@  struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (5),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
@@ -449,6 +489,14 @@  struct processor_costs pentiumpro_cost =
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
@@ -525,6 +573,14 @@  struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (11),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (11),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (47),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   geode_memcpy,
   geode_memset,
@@ -603,6 +659,14 @@  struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (56),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k6_memcpy,
   k6_memset,
@@ -681,6 +745,15 @@  struct processor_costs athlon_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  /* 11-16  */
+  COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   athlon_memcpy,
   athlon_memset,
@@ -768,6 +841,15 @@  struct processor_costs k8_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  /* 11-16  */
+  COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k8_memcpy,
   k8_memset,
@@ -862,6 +944,15 @@  struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  /* 11-16  */
+  COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   amdfam10_memcpy,
   amdfam10_memset,
@@ -957,6 +1048,16 @@  const struct processor_costs bdver1_cost
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
+  /* 9-24  */
+  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
+  /* 9-27  */
+  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver1_memcpy,
   bdver1_memset,
@@ -1053,6 +1154,16 @@  const struct processor_costs bdver2_cost
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
+  /* 9-24  */
+  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
+  /* 9-27  */
+  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver2_memcpy,
   bdver2_memset,
@@ -1140,6 +1251,16 @@  struct processor_costs bdver3_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
+  /* 9-24  */
+  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
+  /* 9-27  */
+  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver3_memcpy,
   bdver3_memset,
@@ -1226,6 +1347,16 @@  struct processor_costs bdver4_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
+  /* 9-24  */
+  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
+  /* 9-27  */
+  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver4_memcpy,
   bdver4_memset,
@@ -1264,15 +1395,17 @@  struct processor_costs znver1_cost = {
   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
    COSTS_N_INSNS (3),			/*				 HI.  */
    COSTS_N_INSNS (3),			/*				 SI.  */
-   COSTS_N_INSNS (4),			/*				 DI.  */
-   COSTS_N_INSNS (4)},			/*			      other.  */
+   COSTS_N_INSNS (3),			/*				 DI.  */
+   COSTS_N_INSNS (3)},			/*			      other.  */
   0,					/* cost of multiply per each bit
 					    set.  */
-  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI.  */
-   COSTS_N_INSNS (35),			/*			    HI.  */
-   COSTS_N_INSNS (51),			/*			    SI.  */
-   COSTS_N_INSNS (83),			/*			    DI.  */
-   COSTS_N_INSNS (83)},			/*			    other.  */
+   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
+      bound.  */
+  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (22),			/*			    HI.  */
+   COSTS_N_INSNS (30),			/*			    SI.  */
+   COSTS_N_INSNS (45),			/*			    DI.  */
+   COSTS_N_INSNS (45)},			/*			    other.  */
   COSTS_N_INSNS (1),			/* cost of movsx.  */
   COSTS_N_INSNS (1),			/* cost of movzx.  */
   8,					/* "large" insn.  */
@@ -1310,12 +1443,23 @@  struct processor_costs znver1_cost = {
      time).  */
   100,					/* number of parallel prefetches.  */
   3,					/* Branch cost.  */
-  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
-  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
-  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
-  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
-  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+  COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
+  /* Latency of fdiv is 8-15.  */
+  COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
+  /* Latency of fsqrt is 4-10.  */
+  COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
+  /* 9-13  */
+  COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
   /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
      and it can execute 2 integer additions and 2 multiplications thus
      reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
@@ -1413,6 +1557,14 @@  const struct processor_costs btver1_cost
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver1_memcpy,
   btver1_memset,
@@ -1499,6 +1651,14 @@  const struct processor_costs btver2_cost
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver2_memcpy,
   btver2_memset,
@@ -1576,6 +1736,14 @@  struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (23),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium4_memcpy,
   pentium4_memset,
@@ -1656,6 +1824,14 @@  struct processor_costs nocona_cost = {
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (7),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (7),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (32),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   nocona_memcpy,
   nocona_memset,
@@ -1734,6 +1910,14 @@  struct processor_costs atom_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   atom_memcpy,
   atom_memset,
@@ -1812,6 +1996,14 @@  struct processor_costs slm_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   slm_memcpy,
   slm_memset,
@@ -1890,6 +2082,14 @@  struct processor_costs intel_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   intel_memcpy,
   intel_memset,
@@ -1978,6 +2178,14 @@  struct processor_costs generic_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   generic_memcpy,
   generic_memset,
@@ -2065,6 +2273,14 @@  struct processor_costs core_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   core_memcpy,
   core_memset,