===================================================================
@@ -41,7 +41,9 @@
(define_cpu_unit "bdver1-decode0" "bdver1")
(define_cpu_unit "bdver1-decode1" "bdver1")
(define_cpu_unit "bdver1-decode2" "bdver1")
-(define_cpu_unit "bdver1-decodev" "bdver1")
+(define_cpu_unit "bdver1-decode0b" "bdver1")
+(define_cpu_unit "bdver1-decode1b" "bdver1")
+(define_cpu_unit "bdver1-decode2b" "bdver1")
;; Model the fact that double decoded instruction may take 2 cycles
;; to decode when decoder2 and decoder0 in next cycle
@@ -57,18 +59,26 @@
;; too. Vector decoded instructions then can't be issued when modeled
;; as consuming decoder0+decoder1+decoder2.
;; We solve that by specialized vector decoder unit and exclusion set.
-(presence_set "bdver1-decode2" "bdver1-decode0")
-(exclusion_set "bdver1-decodev" "bdver1-decode0,bdver1-decode1,bdver1-decode2")
-
-(define_reservation "bdver1-vector" "nothing,bdver1-decodev")
-(define_reservation "bdver1-direct1" "nothing,bdver1-decode1")
+(final_presence_set "bdver1-decode2" "bdver1-decode0,bdver1-decode1")
+(presence_set "bdver1-decode0b,bdver1-decode1b,bdver1-decode2b" "bdver1-decode0,bdver1-decode1")
+(final_presence_set "bdver1-decode2b" "bdver1-decode0b,bdver1-decode1b")
+
+(define_reservation "use-decodera" "((bdver1-decode0 | nothing)
+ + (bdver1-decode1 | nothing)
+ + (bdver1-decode2 | nothing))")
+(define_reservation "bdver1-vector" "nothing,((bdver1-decode0+bdver1-decode1+bdver1-decode2)
+ |(use-decodera+bdver1-decode0b+bdver1-decode1b+bdver1-decode2b))")
+(define_reservation "bdver1-direct1" "nothing,(bdver1-decode1|(use-decodera+bdver1-decode1b))")
(define_reservation "bdver1-direct" "nothing,
(bdver1-decode0 | bdver1-decode1
- | bdver1-decode2)")
+ | bdver1-decode2 | (use-decodera+bdver1-decode0b)
+ | (use-decodera+bdver1-decode1b) | (use-decodera+bdver1-decode2b))")
;; Double instructions behaves like two direct instructions.
(define_reservation "bdver1-double" "((bdver1-decode2,bdver1-decode0)
| (nothing,(bdver1-decode0 + bdver1-decode1))
- | (nothing,(bdver1-decode1 + bdver1-decode2)))")
+ | (nothing,(bdver1-decode1 + bdver1-decode2))
+ | (nothing,(use-decodera + bdver1-decode0b + bdver1-decode1b))
+ | (nothing,(use-decodera + bdver1-decode1b + bdver1-decode2b)))")
(define_cpu_unit "bdver1-ieu0" "bdver1_ieu")
@@ -131,17 +141,28 @@
(eq_attr "type" "call,callv"))
"bdver1-double,bdver1-agu")
;; PUSH mem is double path.
+(define_insn_reservation "bdver1_pushmem" 1
+ (and (eq_attr "cpu" "bdver1,bdver2")
+ (and (eq_attr "type" "push")
+ (eq_attr "memory" "both")))
+ "bdver1-direct,bdver1-load,bdver1-store")
(define_insn_reservation "bdver1_push" 1
(and (eq_attr "cpu" "bdver1,bdver2")
(eq_attr "type" "push"))
- "bdver1-direct,bdver1-agu,bdver1-store")
+ "bdver1-direct,bdver1-store")
;; POP r16/mem are double path.
+;; 16-bit pops are not really used by GCC.
+(define_insn_reservation "bdver1_popmem" 1
+ (and (eq_attr "cpu" "bdver1,bdver2")
+ (and (eq_attr "type" "pop")
+ (eq_attr "memory" "both")))
+ "bdver1-direct,bdver1-load,bdver1-store")
(define_insn_reservation "bdver1_pop" 1
(and (eq_attr "cpu" "bdver1,bdver2")
(eq_attr "type" "pop"))
- "bdver1-direct,bdver1-ivector")
-;; LEAVE no latency info so far, assume same with amdfam10.
-(define_insn_reservation "bdver1_leave" 3
+ "bdver1-direct,bdver1-load")
+;; According to Agner Fog, the latency is 4.
+(define_insn_reservation "bdver1_leave" 4
(and (eq_attr "cpu" "bdver1,bdver2")
(eq_attr "type" "leave"))
"bdver1-vector,bdver1-ivector")
===================================================================
@@ -24427,11 +25019,13 @@ ix86_issue_rate (void)
case PROCESSOR_K8:
case PROCESSOR_AMDFAM10:
case PROCESSOR_GENERIC:
- case PROCESSOR_BDVER1:
- case PROCESSOR_BDVER2:
- case PROCESSOR_BDVER3:
case PROCESSOR_BTVER1:
return 3;
+ case PROCESSOR_BDVER3:
+ return 4;
+ case PROCESSOR_BDVER1:
+ case PROCESSOR_BDVER2:
+ return 6;
case PROCESSOR_CORE2:
case PROCESSOR_COREI7:
@@ -24697,11 +25291,27 @@ ix86_adjust_cost (rtx insn, rtx link, rt
case PROCESSOR_GENERIC:
memory = get_attr_memory (insn);
- /* Stack engine allows to execute push&pop instructions in parall. */
+ /* The stack engine allows push and pop instructions to execute in parallel.
+ ??? There seems to be no detailed documentation of AMDFAM10; perhaps
+ it is actually equivalent to the stronger notion of stack engine below. */
if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
- && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
+ && (ix86_tune == PROCESSOR_AMDFAM10))
+ return 0;
+
+ /* For Bulldozer and up, the stack engine makes the value of the stack
+ pointer available immediately, no matter how it is used (i.e. when ESP is
+ used as a pointer or for arithmetic, the cost is bypassed, too). */
+ if (ix86_tune >= PROCESSOR_BDVER1
+ && dep_insn_type == TYPE_PUSH)
return 0;
+ if (ix86_tune >= PROCESSOR_BDVER1
+ && dep_insn_type == TYPE_POP)
+ {
+ rtx dest = SET_DEST (PATTERN (dep_insn));
+ if (REG_P (dest) && !reg_referenced_p (dest, PATTERN (insn)))
+ return 0;
+ }
/* Show ability of reorder buffer to hide latency of load by executing
in parallel with previous instruction in case