diff mbox series

Zen tuning part 1 (reassociation width)

Message ID 20171005134602.GA57013@kam.mff.cuni.cz
State New
Headers show
Series Zen tuning part 1 (reassociation width) | expand

Commit Message

Jan Hubicka Oct. 5, 2017, 1:46 p.m. UTC
Hi,
this patch enables reassociation of integer and vector operations for Zen.
While doing so I have noticed that the logic is split across three target hooks
(TARGET_VECTOR_PARALLEL_EXECUTION, TARGET_REASSOC_INT_TO_PARALLEL and
TARGET_REASSOC_FP_TO_PARALLEL) and function ix86_reassociation_width.

This makes it quite non-obvious that all three places need to be kept in sync
and in fact the comments in ix86_reassociation_width seem to suggest that
often this was not done properly.

This patch replaces it with a scheme similar to the arm backend's - the
reassociation widths are split into int, fp, vector int and vector fp and are
present in the cost tables (where one looks while doing the tuning).

ix86_reassociation_width then just handles special cases which do not fit
into the table scheme very well (like the asymmetry of integer vector operations
on Zen and the fact that Zen splits 256 bit operations into 128bit parts and
thus the reassociation width is smaller).

I have also kept the existing logic of capping the value at 2 for 32bit
compilation.  I did not benchmark that thoroughly but I assume it sort of makes
sense given the register pressure costs.

I have tried my best to preserve the existing settings for different targets
(some of which I think do not make much sense).  One change I am aware of is
that I enabled reassociation for core2, which shares the same cost table with
later core based CPUs.  I think that should be a good idea because core2
parallelism was similar to later variants.

For Zen I have experimented with enabling reassociation in all four cases
(int/fp, vector or not). Using the theoretical width needed by the CPU leads
to a small regression in specFP2000 and thus I chose 4 instead of 6. We may want
to reduce the value if further regressions are tracked down to this patch.

Incrementally we may try to experiment with sanitizing the values for other
targets and generic tuning.

Bootstrapped/regtested x86_64-linux.
Will commit it shortly.

Honza

	* i386.c (ix86_size_cost, i386_cost, i486_cost, pentium_cost,
	lakemont_cost, pentiumpro_cost, geode_cost, k6_cost,
	athlon_cost, k8_cost, amdfam10_cost, btver1_cost, btver2_cost,
	pentium4_cost, nocona_cost): Set reassociation width to 1.
	(bdver1_cost, bdver2_cost, bdver3_cost, bdver4_cost): Set reassociation
	width to 2 for fp operations and 1 otherwise.
	(znver1_cost): Set scalar reassoc width to 4 and vector to 3 and 6
	for int and fp.
	(atom_cost): Set reassociation width to 2.
	(slm_cost, generic_cost): Set fp reassociation width to 2 and 1 otherwise.
	(intel_cost): Set fp reassociation width to 4 and 1 otherwise.
	(core_cost): Set fp reassociation width to 4 and vector to 2.
	(ix86_reassociation_width): Rewrite using cost table; special case
	plus/minus on Zen; honor X86_TUNE_SSE_SPLIT_REGS
	and TARGET_AVX128_OPTIMAL.
	* i386.h (processor_costs): Add
	reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp.
	(TARGET_VECTOR_PARALLEL_EXECUTION, TARGET_REASSOC_INT_TO_PARALLEL,
	TARGET_REASSOC_FP_TO_PARALLEL): Remove.
	* x86-tune.def (X86_TUNE_REASSOC_INT_TO_PARALLEL): Remove.
	(X86_TUNE_REASSOC_FP_TO_PARALLEL): Remove.
	(X86_TUNE_VECTOR_PARALLEL_EXECUTION):  Remove.
diff mbox series

Patch

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 253443)
+++ config/i386/i386.c	(working copy)
@@ -177,6 +177,7 @@  struct processor_costs ix86_size_cost =
   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   ix86_size_memcpy,
   ix86_size_memset,
   1,					/* scalar_stmt_cost.  */
@@ -253,6 +254,7 @@  struct processor_costs i386_cost = {	/*
   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i386_memcpy,
   i386_memset,
   1,					/* scalar_stmt_cost.  */
@@ -330,6 +332,7 @@  struct processor_costs i486_cost = {	/*
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i486_memcpy,
   i486_memset,
   1,					/* scalar_stmt_cost.  */
@@ -405,6 +408,7 @@  struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
   1,					/* scalar_stmt_cost.  */
@@ -473,6 +477,7 @@  struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
   1,					/* scalar_stmt_cost.  */
@@ -556,6 +561,7 @@  struct processor_costs pentiumpro_cost =
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
   1,					/* scalar_stmt_cost.  */
@@ -631,6 +637,7 @@  struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   geode_memcpy,
   geode_memset,
   1,					/* scalar_stmt_cost.  */
@@ -708,6 +715,7 @@  struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k6_memcpy,
   k6_memset,
   1,					/* scalar_stmt_cost.  */
@@ -785,6 +793,7 @@  struct processor_costs athlon_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   athlon_memcpy,
   athlon_memset,
   1,					/* scalar_stmt_cost.  */
@@ -871,7 +880,7 @@  struct processor_costs k8_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
-
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k8_memcpy,
   k8_memset,
   4,					/* scalar_stmt_cost.  */
@@ -965,7 +974,7 @@  struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
-
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   amdfam10_memcpy,
   amdfam10_memset,
   4,					/* scalar_stmt_cost.  */
@@ -1060,7 +1069,7 @@  const struct processor_costs bdver1_cost
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
-
+  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver1_memcpy,
   bdver1_memset,
   6,					/* scalar_stmt_cost.  */
@@ -1156,7 +1165,7 @@  const struct processor_costs bdver2_cost
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
-
+  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver2_memcpy,
   bdver2_memset,
   6,					/* scalar_stmt_cost.  */
@@ -1243,7 +1252,7 @@  struct processor_costs bdver3_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
-
+  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver3_memcpy,
   bdver3_memset,
   6,					/* scalar_stmt_cost.  */
@@ -1329,7 +1338,7 @@  struct processor_costs bdver4_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
-
+  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver4_memcpy,
   bdver4_memset,
   6,					/* scalar_stmt_cost.  */
@@ -1419,7 +1428,15 @@  struct processor_costs znver1_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
-
+  /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
+     and it can execute 2 integer additions and 2 multiplications thus
+     reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
+     that 4 works better than 6 probably due to register pressure.
+
+     Integer vector operations are taken by FP unit and execute 3 vector
+     plus/minus operations per cycle but only one multiply.  This is adjusted
+     in ix86_reassociation_width.  */
+  4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
   znver1_memcpy,
   znver1_memset,
   6,					/* scalar_stmt_cost.  */
@@ -1508,7 +1525,7 @@  const struct processor_costs btver1_cost
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
-
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver1_memcpy,
   btver1_memset,
   4,					/* scalar_stmt_cost.  */
@@ -1594,6 +1611,7 @@  const struct processor_costs btver2_cost
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver2_memcpy,
   btver2_memset,
   4,					/* scalar_stmt_cost.  */
@@ -1670,6 +1688,7 @@  struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium4_memcpy,
   pentium4_memset,
   1,					/* scalar_stmt_cost.  */
@@ -1749,6 +1768,7 @@  struct processor_costs nocona_cost = {
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
+  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   nocona_memcpy,
   nocona_memset,
   1,					/* scalar_stmt_cost.  */
@@ -1826,6 +1846,7 @@  struct processor_costs atom_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   atom_memcpy,
   atom_memset,
   1,					/* scalar_stmt_cost.  */
@@ -1903,6 +1924,7 @@  struct processor_costs slm_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   slm_memcpy,
   slm_memset,
   1,					/* scalar_stmt_cost.  */
@@ -1980,6 +2002,7 @@  struct processor_costs intel_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   intel_memcpy,
   intel_memset,
   1,					/* scalar_stmt_cost.  */
@@ -2067,6 +2090,7 @@  struct processor_costs generic_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   generic_memcpy,
   generic_memset,
   1,					/* scalar_stmt_cost.  */
@@ -2153,6 +2177,7 @@  struct processor_costs core_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   core_memcpy,
   core_memset,
   1,					/* scalar_stmt_cost.  */
@@ -51830,34 +51855,47 @@  has_dispatch (rtx_insn *insn, int action
 /* Implementation of reassociation_width target hook used by
    reassoc phase to identify parallelism level in reassociated
    tree.  Statements tree_code is passed in OPC.  Arguments type
-   is passed in MODE.
-
-   Currently parallel reassociation is enabled for Atom
-   processors only and we set reassociation width to be 2
-   because Atom may issue up to 2 instructions per cycle.
-
-   Return value should be fixed if parallel reassociation is
-   enabled for other processors.  */
+   is passed in MODE.  */
 
 static int
-ix86_reassociation_width (unsigned int, machine_mode mode)
+ix86_reassociation_width (unsigned int op, machine_mode mode)
 {
+  int width = 1;
   /* Vector part.  */
   if (VECTOR_MODE_P (mode))
     {
-      if (TARGET_VECTOR_PARALLEL_EXECUTION)
-	return 2;
-      else
+      int div;
+      if (INTEGRAL_MODE_P (mode))
+	width = ix86_cost->reassoc_vec_int;
+      else if (FLOAT_MODE_P (mode))
+	width = ix86_cost->reassoc_vec_fp;
+
+      if (width == 1)
+	return 1;
+
+      /* Integer vector instructions execute in FP unit
+	 and can execute 3 additions and one multiplication per cycle.  */
+      if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
+	  && op != PLUS && op != MINUS)
 	return 1;
-    }
 
+      /* Account for targets that split wide vectors into multiple parts.  */
+      if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
+	div = GET_MODE_BITSIZE (mode) / 128;
+      else if (X86_TUNE_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
+	div = GET_MODE_BITSIZE (mode) / 64;
+      width = (width + div - 1) / div;
+    }
   /* Scalar part.  */
-  if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
-    return 2;
-  else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
-    return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
-  else
-    return 1;
+  else if (INTEGRAL_MODE_P (mode))
+    width = ix86_cost->reassoc_int;
+  else if (FLOAT_MODE_P (mode))
+    width = ix86_cost->reassoc_fp;
+
+  /* Avoid using too many registers in 32bit mode.  */
+  if (!TARGET_64BIT && width > 2)
+    width = 2;
+  return width;
 }
 
 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 253443)
+++ config/i386/i386.h	(working copy)
@@ -257,6 +257,13 @@  struct processor_costs {
   const int fsqrt;		/* cost of FSQRT instruction.  */
 				/* Specify what algorithm
 				   to use for stringops on unknown size.  */
+  const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp;
+				/* Specify reassociation width for integer,
+				   fp, vector integer and vector fp
+				   operations.  Generally should correspond
+				   to number of instructions executed in
+				   parallel.  See also
+				   ix86_reassociation_width.  */
   struct stringop_algs *memcpy, *memset;
   const int scalar_stmt_cost;   /* Cost of any scalar operation, excluding
 				   load and store.  */
@@ -466,8 +473,6 @@  extern unsigned char ix86_tune_features[
 	ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
 #define TARGET_SLOW_PSHUFB \
 	ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
-#define TARGET_VECTOR_PARALLEL_EXECUTION \
-	ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION]
 #define TARGET_AVOID_4BYTE_PREFIXES \
 	ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES]
 #define TARGET_FUSE_CMP_AND_BRANCH_32 \
@@ -488,10 +493,6 @@  extern unsigned char ix86_tune_features[
 	ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
 #define TARGET_AVX128_OPTIMAL \
 	ix86_tune_features[X86_TUNE_AVX128_OPTIMAL]
-#define TARGET_REASSOC_INT_TO_PARALLEL \
-	ix86_tune_features[X86_TUNE_REASSOC_INT_TO_PARALLEL]
-#define TARGET_REASSOC_FP_TO_PARALLEL \
-	ix86_tune_features[X86_TUNE_REASSOC_FP_TO_PARALLEL]
 #define TARGET_GENERAL_REGS_SSE_SPILL \
 	ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL]
 #define TARGET_AVOID_MEM_OPND_FOR_CMOVE \
Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def	(revision 253443)
+++ config/i386/x86-tune.def	(working copy)
@@ -117,16 +117,6 @@  DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_S
 DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
           m_SANDYBRIDGE | m_HASWELL)
 
-/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
-   during reassociation of integer computation.  */
-DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
-          m_BONNELL)
-
-/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
-   during reassociation of fp computation.  */
-DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
-          m_BONNELL | m_SILVERMONT | m_HASWELL | m_KNL | m_KNM |m_INTEL | m_BDVER1
-	  | m_BDVER2 | m_ZNVER1 | m_GENERIC)
 
 /*****************************************************************************/
 /* Function prologue, epilogue and function calling sequences.               */
@@ -391,11 +381,6 @@  DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS,
 DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
           m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL)
 
-/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
-   execute 2 or more vector instructions in parallel.  */
-DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel",
-          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
-
 /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
 DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
           m_SILVERMONT | m_INTEL)