Patchwork [i386] : Fix PR 52908 - xop-mul-1:f9 miscompiled on bulldozer (-mxop)

login
register
mail settings
Submitter Uros Bizjak
Date May 9, 2012, 8:38 p.m.
Message ID <CAFULd4Z65612+=rKHET9cHKjjWzzK5x-xQs3HWt8FY2wDSovPQ@mail.gmail.com>
Download mbox | patch
Permalink /patch/158043/
State New
Headers show

Comments

Uros Bizjak - May 9, 2012, 8:38 p.m.
Hello!

Attached patch fixes PR 52908. There is no need to generate "fake"
multiply instructions after reload, we can expand directly to MAC
instructions. This approach even produced better assembly for a couple
of testcases in gcc.target/i386 testsuite.

2012-05-09  Uros Bizjak  <ubizjak@gmail.com>

	PR target/52908
	* config/i386/sse.md (vec_widen_smult_hi_v4si): Expand using
	xop_pmacsdqh insn pattern instead of xop_mulv2div2di3_high.
	(vec_widen_smult_lo_v4si): Expand using xop_pmacsdql insn pattern
	instead of xop_mulv2div2di3_low.
	(xop_p<macs>dql): Fix vec_select selector.
	(xop_p<macs>dqh): Ditto.
	(xop_mulv2div2di3_low): Remove insn_and_split pattern.
	(xop_mulv2div2di3_high): Ditto.

testsuite/ChangeLog:

	PR target/52908
	* gcc.target/i386/xop-imul32widen-vector.c: Update scan-assembler
	directive to Scan for vpmuldq, not vpmacsdql.

Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}, and
tested on XOP target by Venkataramanan.

Patch was committed to mainline SVN. The version, attached to the PR
should be backported to other release branches, but another volunteer
should do the backport, since I don't have access to XOP target.

Also, please note that XOP horizontal add/subtract instructions (and
possibly others) have vec_select parallel RTX in wrong endiannes.

Uros.

Patch

Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 187347)
+++ config/i386/sse.md	(working copy)
@@ -5748,11 +5748,15 @@ 
 
   if (TARGET_XOP)
     {
+      rtx t3 = gen_reg_rtx (V2DImode);
+
       emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
 				    GEN_INT (1), GEN_INT (3)));
       emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
 				    GEN_INT (1), GEN_INT (3)));
-      emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+      emit_move_insn (t3, CONST0_RTX (V2DImode));
+
+      emit_insn (gen_xop_pmacsdqh (operands[0], t1, t2, t3));
       DONE;
     }
 
@@ -5777,11 +5781,15 @@ 
 
   if (TARGET_XOP)
     {
+      rtx t3 = gen_reg_rtx (V2DImode);
+
       emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
 				    GEN_INT (1), GEN_INT (3)));
       emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
 				    GEN_INT (1), GEN_INT (3)));
-      emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+      emit_move_insn (t3, CONST0_RTX (V2DImode));
+
+      emit_insn (gen_xop_pmacsdql (operands[0], t1, t2, t3));
       DONE;
     }
 
@@ -9792,11 +9800,11 @@ 
 	  (sign_extend:V2DI
 	   (vec_select:V2SI
 	    (match_operand:V4SI 1 "nonimmediate_operand" "%x")
-	    (parallel [(const_int 1) (const_int 3)])))
+	    (parallel [(const_int 0) (const_int 2)])))
 	  (sign_extend:V2DI
 	   (vec_select:V2SI
 	    (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-	    (parallel [(const_int 1) (const_int 3)]))))
+	    (parallel [(const_int 0) (const_int 2)]))))
 	 (match_operand:V2DI 3 "nonimmediate_operand" "x")))]
   "TARGET_XOP"
   "vp<macs>dql\t{%3, %2, %1, %0|%0, %1, %2, %3}"
@@ -9810,93 +9818,17 @@ 
 	  (sign_extend:V2DI
 	   (vec_select:V2SI
 	    (match_operand:V4SI 1 "nonimmediate_operand" "%x")
-	    (parallel [(const_int 0) (const_int 2)])))
+	    (parallel [(const_int 1) (const_int 3)])))
 	  (sign_extend:V2DI
 	   (vec_select:V2SI
 	    (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-	    (parallel [(const_int 0) (const_int 2)]))))
+	    (parallel [(const_int 1) (const_int 3)]))))
 	 (match_operand:V2DI 3 "nonimmediate_operand" "x")))]
   "TARGET_XOP"
   "vp<macs>dqh\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssemuladd")
    (set_attr "mode" "TI")])
 
-;; We don't have a straight 32-bit parallel multiply and extend on XOP, so
-;; fake it with a multiply/add.  In general, we expect the define_split to
-;; occur before register allocation, so we have to handle the corner case where
-;; the target is the same as operands 1/2
-(define_insn_and_split "xop_mulv2div2di3_low"
-  [(set (match_operand:V2DI 0 "register_operand" "=&x")
-	(mult:V2DI
-	  (sign_extend:V2DI
-	    (vec_select:V2SI
-	      (match_operand:V4SI 1 "register_operand" "%x")
-	      (parallel [(const_int 1) (const_int 3)])))
-	  (sign_extend:V2DI
-	    (vec_select:V2SI
-	      (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-	      (parallel [(const_int 1) (const_int 3)])))))]
-  "TARGET_XOP"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0)
-	(match_dup 3))
-   (set (match_dup 0)
-	(plus:V2DI
-	 (mult:V2DI
-	  (sign_extend:V2DI
-	   (vec_select:V2SI
-	    (match_dup 1)
-	    (parallel [(const_int 1) (const_int 3)])))
-	  (sign_extend:V2DI
-	   (vec_select:V2SI
-	    (match_dup 2)
-	    (parallel [(const_int 1) (const_int 3)]))))
-	 (match_dup 0)))]
-{
-  operands[3] = CONST0_RTX (V2DImode);
-}
-  [(set_attr "type" "ssemul")
-   (set_attr "mode" "TI")])
-
-;; We don't have a straight 32-bit parallel multiply and extend on XOP, so
-;; fake it with a multiply/add.  In general, we expect the define_split to
-;; occur before register allocation, so we have to handle the corner case where
-;; the target is the same as either operands[1] or operands[2]
-(define_insn_and_split "xop_mulv2div2di3_high"
-  [(set (match_operand:V2DI 0 "register_operand" "=&x")
-	(mult:V2DI
-	  (sign_extend:V2DI
-	    (vec_select:V2SI
-	      (match_operand:V4SI 1 "register_operand" "%x")
-	      (parallel [(const_int 0) (const_int 2)])))
-	  (sign_extend:V2DI
-	    (vec_select:V2SI
-	      (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-	      (parallel [(const_int 0) (const_int 2)])))))]
-  "TARGET_XOP"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0)
-	(match_dup 3))
-   (set (match_dup 0)
-	(plus:V2DI
-	 (mult:V2DI
-	  (sign_extend:V2DI
-	   (vec_select:V2SI
-	    (match_dup 1)
-	    (parallel [(const_int 0) (const_int 2)])))
-	  (sign_extend:V2DI
-	   (vec_select:V2SI
-	    (match_dup 2)
-	    (parallel [(const_int 0) (const_int 2)]))))
-	 (match_dup 0)))]
-{
-  operands[3] = CONST0_RTX (V2DImode);
-}
-  [(set_attr "type" "ssemul")
-   (set_attr "mode" "TI")])
-
 ;; XOP parallel integer multiply/add instructions for the intrinisics
 (define_insn "xop_p<macs>wd"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
Index: testsuite/gcc.target/i386/xop-imul32widen-vector.c
===================================================================
--- testsuite/gcc.target/i386/xop-imul32widen-vector.c	(revision 187347)
+++ testsuite/gcc.target/i386/xop-imul32widen-vector.c	(working copy)
@@ -32,5 +32,5 @@ 
   exit (0);
 }
 
-/* { dg-final { scan-assembler "vpmacsdql" } } */
+/* { dg-final { scan-assembler "vpmuldq" } } */
 /* { dg-final { scan-assembler "vpmacsdqh" } } */