Patchwork [x86] Fix combine for condditional instructions.

login
register
mail settings
Submitter Uros Bizjak
Date Dec. 13, 2012, 5:03 p.m.
Message ID <CAFULd4Zb28M9fPSM9sB0zem-cbOpZvOy6hRoJ4FBD=xgHV8ocQ@mail.gmail.com>
Download mbox | patch
Permalink /patch/206174/
State New
Headers show

Comments

Uros Bizjak - Dec. 13, 2012, 5:03 p.m.
On Thu, Dec 13, 2012 at 4:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:

> We did not see any performance improvement on Atom in 32-bit mode at
> routelookup from eembc_2_0 (eembc_1_1).

I assume that for x86_64 the patch works as expected. Let's take a
bigger hammer for 32bit targets - the splitter that effectively does
the same as your proposed patch. Please note, that this splitter
operates with optimize_function_for_speed_p predicate, so this
transformation will take place in the whole function. Can you perhaps
investigate what happens if this predicate is changed back to
optimize_insn_for_speed_p () - this is what we would like to have
here?

;; Don't do conditional moves with memory inputs.  This splitter helps
;; register starved x86_32 by forcing inputs into registers before reload.
(define_split
  [(set (match_operand:SWI248 0 "register_operand")
	(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
			       [(reg FLAGS_REG) (const_int 0)])
	  (match_operand:SWI248 2 "nonimmediate_operand")
	  (match_operand:SWI248 3 "nonimmediate_operand")))]
  "!TARGET_64BIT && TARGET_CMOVE
   && (MEM_P (operands[2]) || MEM_P (operands[3]))
   && can_create_pseudo_p ()
   && optimize_function_for_speed_p (cfun)"
  [(set (match_dup 0)
	(if_then_else:SWI248 (match_dup 1) (match_dup 2) (match_dup 3)))]
{
  if (MEM_P (operands[2]))
    operands[2] = force_reg (<MODE>mode, operands[2]);
  if (MEM_P (operands[3]))
    operands[3] = force_reg (<MODE>mode, operands[3]);
})

Attached is the complete patch, including peephole2s.

Uros.
Yuri Rumyantsev - Dec. 14, 2012, 10:47 a.m.
Hi Uros,

With your new fix that add if-then-else splitting for memory operand I
got expected performance speed-up - +6.7% for Atom and +8.4% for SNB.
We need to do all testing this weekend and I will get you our final
feedback on Monday.

Thanks ahead for all your help.
Yuri.

2012/12/13 Uros Bizjak <ubizjak@gmail.com>:
> On Thu, Dec 13, 2012 at 4:02 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>
>> We did not see any performance improvement on Atom in 32-bit mode at
>> routelookup from eembc_2_0 (eembc_1_1).
>
> I assume that for x86_64 the patch works as expected. Let's take a
> bigger hammer for 32bit targets - the splitter that effectively does
> the same as your proposed patch. Please note, that this splitter
> operates with optimize_function_for_speed_p predicate, so this
> transformation will take place in the whole function. Can you perhaps
> investigate what happens if this predicate is changed back to
> optimize_insn_for_speed_p () - this is what we would like to have
> here?
>
> ;; Don't do conditional moves with memory inputs.  This splitter helps
> ;; register starved x86_32 by forcing inputs into registers before reload.
> (define_split
>   [(set (match_operand:SWI248 0 "register_operand")
>         (if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
>                                [(reg FLAGS_REG) (const_int 0)])
>           (match_operand:SWI248 2 "nonimmediate_operand")
>           (match_operand:SWI248 3 "nonimmediate_operand")))]
>   "!TARGET_64BIT && TARGET_CMOVE
>    && (MEM_P (operands[2]) || MEM_P (operands[3]))
>    && can_create_pseudo_p ()
>    && optimize_function_for_speed_p (cfun)"
>   [(set (match_dup 0)
>         (if_then_else:SWI248 (match_dup 1) (match_dup 2) (match_dup 3)))]
> {
>   if (MEM_P (operands[2]))
>     operands[2] = force_reg (<MODE>mode, operands[2]);
>   if (MEM_P (operands[3]))
>     operands[3] = force_reg (<MODE>mode, operands[3]);
> })
>
> Attached is the complete patch, including peephole2s.
>
> Uros.

Patch

Index: i386.md
===================================================================
--- i386.md	(revision 194451)
+++ i386.md	(working copy)
@@ -16093,6 +16093,27 @@ 
   [(set_attr "type" "icmov")
    (set_attr "mode" "<MODE>")])
 
+;; Don't do conditional moves with memory inputs.  This splitter helps
+;; register starved x86_32 by forcing inputs into registers before reload.
+(define_split
+  [(set (match_operand:SWI248 0 "register_operand")
+	(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
+			       [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:SWI248 2 "nonimmediate_operand")
+	  (match_operand:SWI248 3 "nonimmediate_operand")))]
+  "!TARGET_64BIT && TARGET_CMOVE
+   && (MEM_P (operands[2]) || MEM_P (operands[3]))
+   && can_create_pseudo_p ()
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0)
+	(if_then_else:SWI248 (match_dup 1) (match_dup 2) (match_dup 3)))]
+{
+  if (MEM_P (operands[2]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+  if (MEM_P (operands[3]))
+    operands[3] = force_reg (<MODE>mode, operands[3]);
+})
+
 (define_insn "*movqicc_noc"
   [(set (match_operand:QI 0 "register_operand" "=r,r")
 	(if_then_else:QI (match_operator 1 "ix86_comparison_operator"
@@ -16105,14 +16126,12 @@ 
    (set_attr "mode" "QI")])
 
 (define_split
-  [(set (match_operand 0 "register_operand")
-	(if_then_else (match_operator 1 "ix86_comparison_operator"
-			[(reg FLAGS_REG) (const_int 0)])
-		      (match_operand 2 "register_operand")
-		      (match_operand 3 "register_operand")))]
+  [(set (match_operand:SWI12 0 "register_operand")
+	(if_then_else:SWI12 (match_operator 1 "ix86_comparison_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+		      (match_operand:SWI12 2 "register_operand")
+		      (match_operand:SWI12 3 "register_operand")))]
   "TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL
-   && (GET_MODE (operands[0]) == QImode
-       || GET_MODE (operands[0]) == HImode)
    && reload_completed"
   [(set (match_dup 0)
 	(if_then_else:SI (match_dup 1) (match_dup 2) (match_dup 3)))]
@@ -16122,6 +16141,31 @@ 
   operands[3] = gen_lowpart (SImode, operands[3]);
 })
 
+;; Don't do conditional moves with memory inputs
+(define_peephole2
+  [(match_scratch:SWI248 2 "r")
+   (set (match_operand:SWI248 0 "register_operand")
+	(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
+			       [(reg FLAGS_REG) (const_int 0)])
+	  (match_dup 0)
+	  (match_operand:SWI248 3 "memory_operand")))]
+  "TARGET_CMOVE && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(if_then_else:SWI248 (match_dup 1) (match_dup 0) (match_dup 2)))])
+
+(define_peephole2
+  [(match_scratch:SWI248 2 "r")
+   (set (match_operand:SWI248 0 "register_operand")
+	(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
+			       [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:SWI248 3 "memory_operand")
+	  (match_dup 0)))]
+  "TARGET_CMOVE && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(if_then_else:SWI248 (match_dup 1) (match_dup 2) (match_dup 0)))])
+
 (define_expand "mov<mode>cc"
   [(set (match_operand:X87MODEF 0 "register_operand")
 	(if_then_else:X87MODEF
@@ -16209,6 +16253,56 @@ 
   [(set_attr "type" "fcmov,fcmov,icmov,icmov")
    (set_attr "mode" "SF,SF,SI,SI")])
 
+;; Don't do conditional moves with memory inputs.  This splitter helps
+;; register starved x86_32 by forcing inputs into registers before reload.
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+	(if_then_else:MODEF (match_operator 1 "ix86_comparison_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:MODEF 2 "nonimmediate_operand")
+	  (match_operand:MODEF 3 "nonimmediate_operand")))]
+  "!TARGET_64BIT && TARGET_80387 && TARGET_CMOVE
+   && (MEM_P (operands[2]) || MEM_P (operands[3]))
+   && can_create_pseudo_p ()
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0)
+	(if_then_else:MODEF (match_dup 1) (match_dup 2) (match_dup 3)))]
+{
+  if (MEM_P (operands[2]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+  if (MEM_P (operands[3]))
+    operands[3] = force_reg (<MODE>mode, operands[3]);
+})
+
+;; Don't do conditional moves with memory inputs
+(define_peephole2
+  [(match_scratch:MODEF 2 "r")
+   (set (match_operand:MODEF 0 "register_and_not_any_fp_reg_operand")
+	(if_then_else:MODEF (match_operator 1 "fcmov_comparison_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+	  (match_dup 0)
+	  (match_operand:MODEF 3 "memory_operand")))]
+  "(<MODE>mode != DFmode || TARGET_64BIT)
+   && TARGET_80387 && TARGET_CMOVE
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(if_then_else:MODEF (match_dup 1) (match_dup 0) (match_dup 2)))])
+
+(define_peephole2
+  [(match_scratch:MODEF 2 "r")
+   (set (match_operand:MODEF 0 "register_and_not_any_fp_reg_operand")
+	(if_then_else:MODEF (match_operator 1 "fcmov_comparison_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:MODEF 3 "memory_operand")
+	  (match_dup 0)))]
+  "(<MODE>mode != DFmode || TARGET_64BIT)
+   && TARGET_80387 && TARGET_CMOVE
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(if_then_else:MODEF (match_dup 1) (match_dup 2) (match_dup 0)))])
+
 ;; All moves in XOP pcmov instructions are 128 bits and hence we restrict
 ;; the scalar versions to have only XMM registers as operands.