Patchwork Fix core's cost of stringops

login
register
mail settings
Submitter Jan Hubicka
Date Nov. 23, 2012, 4:01 p.m.
Message ID <20121123160123.GB26654@kam.mff.cuni.cz>
Download mbox | patch
Permalink /patch/201361/
State New
Headers show

Comments

Jan Hubicka - Nov. 23, 2012, 4:01 p.m.
Hi,
when SSE memcpy/memset code was reverted last stage3 (because of alignment
handling bug), it went also with some unrelated changes, in particular
retunning of stringop descriptors for the new glibc implementation that finally
is sane for modern architectures.  This patch partly restores the second
part; I decided to do it on per-CPU basis and started with core.

The patch is rather long since our current Core tunning is just cut&paste
from generic model sharing the cost structure. This needs to be unshared
and further there are two generic cpus, one for 32bit and one for 64bit.
The reason is that generic tunning takes into account 32bit only CPUs,
while 64bit tunning ignores them.  This is not needed for Core, since it
is just one CPU.

Finally Core for the first time do not care about alignment of stringops; 
this allows us to drop the costly alignment prologue and thus we need extra
field in the descriptor table deciding on when it is good idea to do so.

Bootstrapped/regtested x86_64-linux, comitted.

Honza

	* i386-c.c (ix86_target_macros_internal): Update handling of core
	chips.
	* i386.c (DUMMY_STRINGOP_ALGS): Update
	(*_cost): Update.
	(core_cost): Copy from generic64_cost; fix stringop descriptors.
	(m_CORE2_32, m_CORE2_64, m_COREI7_32, m_COREI7_64, m_CORE2I7_32,
	m_CORE2I7_64): Remove.
	(m_CORE2, m_COREI7, m_CORE2I7): Update.
	(initial_ix86_tune_features): Update.
	(processor_target): Update.
	(ix86_option_override_internal): Update.
	(ix86_option_override_internal): Remove PROCESSOR_CORE2_64,
	PROCESSOR_COREI7_64 special cases.
	(decide_alg): Add noalign parameter; set it.
	(ix86_expand_movmem, ix86_expand_setmem): Update noalign handling.
	(ix86_issue_rate): Update.
	(ia32_multipass_dfa_lookahead): Update.
	(ix86_sched_init_global): Update.
	(get_builtin_code_for_version): Update.
	* i386.h (stringop_strategy): Add noalign flag.
	(TARGET_CORE2_32, TARGET_CORE2_64, TARGET_COREI7_32, TARGET_COREI7_64):
	Remove.
	(TARGET_CORE2, TARGET_COREI7): New.
	(enum processor_type): Remove PROCESSOR_CORE2_32, PROCESSOR_CORE2_64,
	PROCESSOR_COREI7_32, PROCESSOR_COREI7_64; add PROCESSOR_CORE2,
	PROCESSOR_COREI7.

Patch

Index: config/i386/i386-c.c
===================================================================
--- config/i386/i386-c.c	(revision 193749)
+++ config/i386/i386-c.c	(working copy)
@@ -134,13 +134,11 @@  ix86_target_macros_internal (HOST_WIDE_I
       def_or_undef (parse_in, "__nocona");
       def_or_undef (parse_in, "__nocona__");
       break;
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
+    case PROCESSOR_CORE2:
       def_or_undef (parse_in, "__core2");
       def_or_undef (parse_in, "__core2__");
       break;
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_COREI7:
       def_or_undef (parse_in, "__corei7");
       def_or_undef (parse_in, "__corei7__");
       break;
@@ -228,12 +226,10 @@  ix86_target_macros_internal (HOST_WIDE_I
     case PROCESSOR_NOCONA:
       def_or_undef (parse_in, "__tune_nocona__");
       break;
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
+    case PROCESSOR_CORE2:
       def_or_undef (parse_in, "__tune_core2__");
       break;
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_COREI7:
       def_or_undef (parse_in, "__tune_corei7__");
       break;
     case PROCESSOR_ATOM:
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 193749)
+++ config/i386/i386.c	(working copy)
@@ -83,7 +83,7 @@  static rtx legitimize_dllimport_symbol (
 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
 #define COSTS_N_BYTES(N) ((N) * 2)
 
-#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
+#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
 
 const
 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
@@ -138,10 +138,10 @@  struct processor_costs ix86_size_cost =
   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
-  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
-   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
-  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
-   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
@@ -209,9 +209,9 @@  struct processor_costs i386_cost = {	/*
   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
-  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -281,9 +281,9 @@  struct processor_costs i486_cost = {	/*
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
-  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -351,9 +351,9 @@  struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{-1, rep_prefix_4_byte}}},
+  {{libcall, {{-1, rep_prefix_4_byte, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -426,11 +426,13 @@  struct processor_costs pentiumpro_cost =
      noticeable win, for bigger blocks either rep movsl or rep movsb is
      way to go.  Rep movsb has apparently more expensive startup time in CPU,
      but after 4K the difference is down in the noise.  */
-  {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
-			{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
+  {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
+			{8192, rep_prefix_4_byte, false},
+		        {-1, rep_prefix_1_byte, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{rep_prefix_4_byte, {{1024, unrolled_loop},
-  			{8192, rep_prefix_4_byte}, {-1, libcall}}},
+  {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
+  			{8192, rep_prefix_4_byte, false},
+			{-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -499,9 +501,9 @@  struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -572,9 +574,9 @@  struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -645,9 +647,9 @@  struct processor_costs athlon_cost = {
   /* For some reason, Athlon deals better with REP prefix (relative to loops)
      compared to K8. Alignment becomes important after 8 bytes for memcpy and
      128 bytes for memset.  */
-  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -723,11 +725,14 @@  struct processor_costs k8_cost = {
   /* K8 has optimized REP instruction for medium sized blocks, but for very
      small blocks it is better to use loop. For large blocks, libcall can
      do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false},
+	      {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
   4,					/* scalar_stmt_cost.  */
   2,					/* scalar load_cost.  */
   2,					/* scalar_store_cost.  */
@@ -810,11 +815,14 @@  struct processor_costs amdfam10_cost = {
   /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall can
      do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   4,					/* scalar_stmt_cost.  */
   2,					/* scalar load_cost.  */
   2,					/* scalar_store_cost.  */
@@ -897,11 +905,14 @@  struct processor_costs bdver1_cost = {
   /*  BDVER1 has optimized REP instruction for medium sized blocks, but for
       very small blocks it is better to use loop. For large blocks, libcall
       can do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   6,					/* scalar_stmt_cost.  */
   4,					/* scalar load_cost.  */
   4,					/* scalar_store_cost.  */
@@ -984,11 +995,14 @@  struct processor_costs bdver2_cost = {
   /*  BDVER2 has optimized REP instruction for medium sized blocks, but for
       very small blocks it is better to use loop. For large blocks, libcall
       can do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   6,					/* scalar_stmt_cost.  */
   4,					/* scalar load_cost.  */
   4,					/* scalar_store_cost.  */
@@ -1063,11 +1077,14 @@  struct processor_costs bdver3_cost = {
   /*  BDVER3 has optimized REP instruction for medium sized blocks, but for
       very small blocks it is better to use loop. For large blocks, libcall
       can do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   6,					/* scalar_stmt_cost.  */
   4,					/* scalar load_cost.  */
   4,					/* scalar_store_cost.  */
@@ -1145,11 +1162,14 @@  struct processor_costs btver1_cost = {
   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall can
      do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   4,					/* scalar_stmt_cost.  */
   2,					/* scalar load_cost.  */
   2,					/* scalar_store_cost.  */
@@ -1224,11 +1244,14 @@  struct processor_costs btver2_cost = {
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
 
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   4,					/* scalar_stmt_cost.  */
   2,					/* scalar load_cost.  */
   2,					/* scalar_store_cost.  */
@@ -1295,10 +1318,10 @@  struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
-  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
+  {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
-   {-1, libcall}}},
+  {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
+	      {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -1366,13 +1389,13 @@  struct processor_costs nocona_cost = {
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
-  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
-	      {100000, unrolled_loop}, {-1, libcall}}}},
-  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
-   {-1, libcall}}},
-   {libcall, {{24, loop}, {64, unrolled_loop},
-	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
+	      {100000, unrolled_loop, false}, {-1, libcall, false}}}},
+  {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
+	      {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
+	      {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
@@ -1439,13 +1462,13 @@  struct processor_costs atom_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
-  {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{32, loop}, {64, rep_prefix_4_byte},
-	  {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {15, unrolled_loop},
-	  {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{24, loop}, {32, unrolled_loop},
-	  {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+	  {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
+	  {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+	  {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
@@ -1520,9 +1543,92 @@  struct processor_costs generic64_cost =
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
   {DUMMY_STRINGOP_ALGS,
-   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   {DUMMY_STRINGOP_ALGS,
-   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/* core_cost should produce code tuned for Core familly of CPUs.  */
+static const
+struct processor_costs core_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  /* On all chips taken into consideration lea is 2 cycles and more.  With
+     this cost however our current implementation of synth_mult results in
+     use of unnecessary temporary registers causing regression on several
+     SPECfp benchmarks.  */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (2)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {8, 8, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {8, 8, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  64,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  /* FIXME perhaps more appropriate value is 5.  */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
+              {-1, libcall, false}}}},
+  {{libcall, {{6, loop_1_byte, true},
+              {24, loop, true},
+              {8192, rep_prefix_4_byte, true},
+              {-1, libcall, false}}},
+   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
+              {-1, libcall, false}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
@@ -1591,9 +1697,11 @@  struct processor_costs generic32_cost =
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
-  {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+	      {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+	      {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -1622,14 +1730,9 @@  const struct processor_costs *ix86_cost
 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
 #define m_NOCONA (1<<PROCESSOR_NOCONA)
 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
-#define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
-#define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
-#define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
-#define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
-#define m_COREI7 (m_COREI7_32 | m_COREI7_64)
-#define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
-#define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
-#define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
+#define m_CORE2 (1<<PROCESSOR_CORE2)
+#define m_COREI7 (1<<PROCESSOR_COREI7)
+#define m_CORE2I7 (m_CORE2 | m_COREI7)
 #define m_ATOM (1<<PROCESSOR_ATOM)
 
 #define m_GEODE (1<<PROCESSOR_GEODE)
@@ -1665,7 +1768,7 @@  static unsigned int initial_ix86_tune_fe
      negatively, so enabling for Generic64 seems like good code size
      tradeoff.  We can't enable it for 32bit generic because it does not
      work well with PPro base chips.  */
-  m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
+  m_386 | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
 
   /* X86_TUNE_PUSH_MEMORY */
   m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
@@ -1856,7 +1959,7 @@  static unsigned int initial_ix86_tune_fe
   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
 
   /* X86_TUNE_AVOID_VECTOR_DECODE */
-  m_CORE2I7_64 | m_K8 | m_GENERIC64,
+  m_CORE2I7 | m_K8 | m_GENERIC64,
 
   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
      and SImode multiply, but 386 and 486 do HImode multiply faster.  */
@@ -1864,11 +1967,11 @@  static unsigned int initial_ix86_tune_fe
 
   /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
      vector path on AMD machines.  */
-  m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
+  m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
 
   /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
      machines.  */
-  m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
+  m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
 
   /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
      than a MOV.  */
@@ -2329,14 +2432,10 @@  static const struct ptt processor_target
   {&pentium4_cost, 0, 0, 0, 0, 0},
   {&k8_cost, 16, 7, 16, 7, 16},
   {&nocona_cost, 0, 0, 0, 0, 0},
-  /* Core 2 32-bit.  */
-  {&generic32_cost, 16, 10, 16, 10, 16},
-  /* Core 2 64-bit.  */
-  {&generic64_cost, 16, 10, 16, 10, 16},
-  /* Core i7 32-bit.  */
-  {&generic32_cost, 16, 10, 16, 10, 16},
-  /* Core i7 64-bit.  */
-  {&generic64_cost, 16, 10, 16, 10, 16},
+  /* Core 2  */
+  {&core_cost, 16, 10, 16, 10, 16},
+  /* Core i7  */
+  {&core_cost, 16, 10, 16, 10, 16},
   {&generic32_cost, 16, 7, 16, 7, 16},
   {&generic64_cost, 16, 10, 16, 10, 16},
   {&amdfam10_cost, 32, 24, 32, 7, 32},
@@ -2799,23 +2898,23 @@  ix86_option_override_internal (bool main
       {"nocona", PROCESSOR_NOCONA, CPU_NONE,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
-      {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
+      {"core2", PROCESSOR_CORE2, CPU_CORE2,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
-      {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
+      {"corei7", PROCESSOR_COREI7, CPU_COREI7,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
-      {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
+      {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
 	| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
-      {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
+      {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
 	| PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
-      {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
+      {"core-avx2", PROCESSOR_COREI7, CPU_COREI7,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
@@ -3327,14 +3426,6 @@  ix86_option_override_internal (bool main
 		ix86_schedule = CPU_PENTIUMPRO;
 		break;
 
-	      case PROCESSOR_CORE2_64:
-		ix86_tune = PROCESSOR_CORE2_32;
-		break;
-
-	      case PROCESSOR_COREI7_64:
-		ix86_tune = PROCESSOR_COREI7_32;
-		break;
-
 	      default:
 		break;
 	      }
@@ -22282,7 +22373,7 @@  expand_constant_setmem_prologue (rtx dst
 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
 static enum stringop_alg
 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
-	    int *dynamic_check)
+	    int *dynamic_check, bool *noalign)
 {
   const struct stringop_algs * algs;
   bool optimize_for_speed;
@@ -22293,6 +22384,7 @@  decide_alg (HOST_WIDE_INT count, HOST_WI
   bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
                              || (memset
 				 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
+  *noalign = false;
 
 #define ALG_USABLE_P(alg) (rep_prefix_usable			\
 			   || (alg != rep_prefix_1_byte		\
@@ -22360,7 +22452,10 @@  decide_alg (HOST_WIDE_INT count, HOST_WI
 		  break;
 		}
 	      else if (ALG_USABLE_P (candidate))
-		return candidate;
+		{
+		  *noalign = algs->size[i].noalign;
+		  return candidate;
+		}
 	    }
 	}
       gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
@@ -22401,7 +22496,7 @@  decide_alg (HOST_WIDE_INT count, HOST_WI
         }
       if (max == -1)
 	max = 4096;
-      alg = decide_alg (count, max / 2, memset, dynamic_check);
+      alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
       gcc_assert (*dynamic_check == -1);
       gcc_assert (alg != libcall);
       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
@@ -22515,6 +22610,7 @@  ix86_expand_movmem (rtx dst, rtx src, rt
   enum stringop_alg alg;
   int dynamic_check;
   bool need_zero_guard = false;
+  bool noalign;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22539,10 +22635,10 @@  ix86_expand_movmem (rtx dst, rtx src, rt
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
 
-  alg = decide_alg (count, expected_size, false, &dynamic_check);
+  alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
   desired_align = decide_alignment (align, alg, expected_size);
 
-  if (!TARGET_ALIGN_STRINGOPS)
+  if (!TARGET_ALIGN_STRINGOPS || noalign)
     align = desired_align;
 
   if (alg == libcall)
@@ -22910,6 +23006,7 @@  ix86_expand_setmem (rtx dst, rtx count_e
   bool force_loopy_epilogue = false;
   int dynamic_check;
   bool need_zero_guard = false;
+  bool noalign;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22929,10 +23026,10 @@  ix86_expand_setmem (rtx dst, rtx count_e
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
 
-  alg = decide_alg (count, expected_size, true, &dynamic_check);
+  alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
   desired_align = decide_alignment (align, alg, expected_size);
 
-  if (!TARGET_ALIGN_STRINGOPS)
+  if (!TARGET_ALIGN_STRINGOPS || noalign)
     align = desired_align;
 
   if (alg == libcall)
@@ -23935,10 +24032,8 @@  ix86_issue_rate (void)
 
     case PROCESSOR_PENTIUMPRO:
     case PROCESSOR_PENTIUM4:
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7:
     case PROCESSOR_ATHLON:
     case PROCESSOR_K8:
     case PROCESSOR_AMDFAM10:
@@ -24193,10 +24288,8 @@  ia32_multipass_dfa_lookahead (void)
     case PROCESSOR_K6:
       return 1;
 
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7:
     case PROCESSOR_ATOM:
       /* Generally, we want haifa-sched:max_issue() to look ahead as far
 	 as many instructions can be executed on a cycle, i.e.,
@@ -24739,10 +24832,8 @@  ix86_sched_init_global (FILE *dump ATTRI
      they are actually used.  */
   switch (ix86_tune)
     {
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7:
       /* Do not perform multipass scheduling for pre-reload schedule
          to save compile time.  */
       if (reload_completed)
@@ -28572,13 +28663,11 @@  get_builtin_code_for_version (tree decl,
 	{
 	  switch (new_target->arch)
 	    {
-	    case PROCESSOR_CORE2_32:
-	    case PROCESSOR_CORE2_64:
+	    case PROCESSOR_CORE2:
 	      arg_str = "core2";
 	      priority = P_PROC_SSSE3;
 	      break;
-	    case PROCESSOR_COREI7_32:
-	    case PROCESSOR_COREI7_64:
+	    case PROCESSOR_COREI7:
 	      arg_str = "corei7";
 	      priority = P_PROC_SSE4_2;
 	      break;
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 193749)
+++ config/i386/i386.h	(working copy)
@@ -101,6 +101,9 @@  see the files COPYING3 and COPYING.RUNTI
    known at compile time or estimated via feedback, the SIZE array
    is walked in order until MAX is greater then the estimate (or -1
    means infinity).  Corresponding ALG is used then.
+   When NOALIGN is true the code guaranting the alignment of the memory
+   block is skipped.
+
    For example initializer:
     {{256, loop}, {-1, rep_prefix_4_byte}}
    will use loop for blocks smaller or equal to 256 bytes, rep prefix will
@@ -111,6 +114,7 @@  struct stringop_algs
   const struct stringop_strategy {
     const int max;
     const enum stringop_alg alg;
+    int noalign;
   } size [MAX_STRINGOP_ALGS];
 };
 
@@ -242,12 +246,8 @@  extern const struct processor_costs ix86
 #define TARGET_K8 (ix86_tune == PROCESSOR_K8)
 #define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON)
 #define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA)
-#define TARGET_CORE2_32 (ix86_tune == PROCESSOR_CORE2_32)
-#define TARGET_CORE2_64 (ix86_tune == PROCESSOR_CORE2_64)
-#define TARGET_CORE2 (TARGET_CORE2_32 || TARGET_CORE2_64)
-#define TARGET_COREI7_32 (ix86_tune == PROCESSOR_COREI7_32)
-#define TARGET_COREI7_64 (ix86_tune == PROCESSOR_COREI7_64)
-#define TARGET_COREI7 (TARGET_COREI7_32 || TARGET_COREI7_64)
+#define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2)
+#define TARGET_COREI7 (ix86_tune == PROCESSOR_COREI7)
 #define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
 #define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
 #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
@@ -2092,10 +2092,8 @@  enum processor_type
   PROCESSOR_PENTIUM4,
   PROCESSOR_K8,
   PROCESSOR_NOCONA,
-  PROCESSOR_CORE2_32,
-  PROCESSOR_CORE2_64,
-  PROCESSOR_COREI7_32,
-  PROCESSOR_COREI7_64,
+  PROCESSOR_CORE2,
+  PROCESSOR_COREI7,
   PROCESSOR_GENERIC32,
   PROCESSOR_GENERIC64,
   PROCESSOR_AMDFAM10,