Patchwork [1/3] i386: Expand mul<VI8_AVX2> earlier

login
register
mail settings
Submitter Richard Henderson
Date June 27, 2012, 3:20 a.m.
Message ID <1340767258-21086-1-git-send-email-rth@redhat.com>
Download mbox | patch
Permalink /patch/167492/
State New
Headers show

Comments

Richard Henderson - June 27, 2012, 3:20 a.m.
Move the expansion code to i386.c next to mulv4si3.  Eliminate
one shift by adding the highparts before shifting.  Correct costs.

        * config/i386/sse.md (mul<VI8_AVX2>3): Change from insn_and_split
        to expander; move guts to ...
        * config/i386/i386.c (ix86_expand_sse2_mulvxdi3): ... here.  Add
        highparts before shifting up.
        * config/i386/i386-protos.h: Update.
---
 gcc/ChangeLog                 |    8 ++++
 gcc/config/i386/i386-protos.h |    1 +
 gcc/config/i386/i386.c        |   90 +++++++++++++++++++++++++++++++++++++++++
 gcc/config/i386/sse.md        |   84 +------------------------------------
 4 files changed, 102 insertions(+), 81 deletions(-)

Patch

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index c860e5a..581b25c 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -227,6 +227,7 @@  extern bool ix86_expand_pinsr (rtx *);
 extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
+extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index c825033..5cf230f 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -32293,6 +32293,14 @@  ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 		extra = 6;
 	      *total = cost->fmul * 2 + cost->fabs * extra;
 	    }
+	  /* V*DImode is emulated with 5-8 insns.  */
+	  else if (mode == V2DImode || mode == V4DImode)
+	    {
+	      if (TARGET_XOP && mode == V2DImode)
+		*total = cost->fmul * 2 + cost->fabs * 3;
+	      else
+		*total = cost->fmul * 3 + cost->fabs * 5;
+	    }
 	  /* Without sse4.1, we don't have PMULLD; it's emulated with 7
 	     insns, including two PMULUDQ.  */
 	  else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
@@ -38915,6 +38923,88 @@  ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
   set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
 }
 
+void
+ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx t1, t2, t3, t4, t5, t6;
+
+  if (TARGET_XOP && mode == V2DImode)
+    {
+      /* op1: A,B,C,D, op2: E,F,G,H */
+      op1 = gen_lowpart (V4SImode, op1);
+      op2 = gen_lowpart (V4SImode, op2);
+
+      t1 = gen_reg_rtx (V4SImode);
+      t2 = gen_reg_rtx (V4SImode);
+      t3 = gen_reg_rtx (V2DImode);
+      t4 = gen_reg_rtx (V2DImode);
+
+      /* t1: B,A,D,C */
+      emit_insn (gen_sse2_pshufd_1 (t1, op1,
+				    GEN_INT (1),
+				    GEN_INT (0),
+				    GEN_INT (3),
+				    GEN_INT (2)));
+
+      /* t2: (B*E),(A*F),(D*G),(C*H) */
+      emit_insn (gen_mulv4si3 (t2, t1, op2));
+
+      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
+      emit_insn (gen_xop_phadddq (t3, t2));
+
+      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
+
+      /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
+      emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
+    }
+  else
+    {
+      enum machine_mode nmode;
+      rtx (*umul) (rtx, rtx, rtx);
+
+      if (mode == V2DImode)
+	{
+	  umul = gen_sse2_umulv2siv2di3;
+	  nmode = V4SImode;
+	}
+      else if (mode == V4DImode)
+	{
+	  umul = gen_avx2_umulv4siv4di3;
+	  nmode = V8SImode;
+	}
+      else
+	gcc_unreachable ();
+
+
+      /* Multiply low parts.  */
+      t1 = gen_reg_rtx (mode);
+      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
+
+      /* Shift input vectors right 32 bits so we can multiply high parts.  */
+      t6 = GEN_INT (32);
+      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
+      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
+
+      /* Multiply high parts by low parts.  */
+      t4 = gen_reg_rtx (mode);
+      t5 = gen_reg_rtx (mode);
+      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
+      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
+
+      /* Combine and shift the highparts back.  */
+      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
+      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
+
+      /* Combine high and low parts.  */
+      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
+    }
+
+  set_unique_reg_note (get_last_insn (), REG_EQUAL,
+		       gen_rtx_MULT (mode, op1, op2));
+}
+
 /* Expand an insert into a vector register through pinsr insn.
    Return true if successful.  */
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 4b51415..81e7dc0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5592,91 +5592,13 @@ 
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn_and_split "mul<mode>3"
+(define_expand "mul<mode>3"
   [(set (match_operand:VI8_AVX2 0 "register_operand")
 	(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
 		       (match_operand:VI8_AVX2 2 "register_operand")))]
-  "TARGET_SSE2
-   && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
-  [(const_int 0)]
+  "TARGET_SSE2"
 {
-  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
-  rtx op0, op1, op2;
-
-  op0 = operands[0];
-  op1 = operands[1];
-  op2 = operands[2];
-
-  if (TARGET_XOP && <MODE>mode == V2DImode)
-    {
-      /* op1: A,B,C,D, op2: E,F,G,H */
-      op1 = gen_lowpart (V4SImode, op1);
-      op2 = gen_lowpart (V4SImode, op2);
-
-      t1 = gen_reg_rtx (V4SImode);
-      t2 = gen_reg_rtx (V4SImode);
-      t3 = gen_reg_rtx (V2DImode);
-      t4 = gen_reg_rtx (V2DImode);
-
-      /* t1: B,A,D,C */
-      emit_insn (gen_sse2_pshufd_1 (t1, op1,
-				    GEN_INT (1),
-				    GEN_INT (0),
-				    GEN_INT (3),
-				    GEN_INT (2)));
-
-      /* t2: (B*E),(A*F),(D*G),(C*H) */
-      emit_insn (gen_mulv4si3 (t2, t1, op2));
-
-      /* t4: (B*E)+(A*F), (D*G)+(C*H) */
-      emit_insn (gen_xop_phadddq (t3, t2));
-
-      /* t5: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
-      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
-
-      /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
-      emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
-    }
-  else
-    {
-      t1 = gen_reg_rtx (<MODE>mode);
-      t2 = gen_reg_rtx (<MODE>mode);
-      t3 = gen_reg_rtx (<MODE>mode);
-      t4 = gen_reg_rtx (<MODE>mode);
-      t5 = gen_reg_rtx (<MODE>mode);
-      t6 = gen_reg_rtx (<MODE>mode);
-      thirtytwo = GEN_INT (32);
-
-      /* Multiply low parts.  */
-      emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
-		  (t1, gen_lowpart (<ssepackmode>mode, op1),
-		   gen_lowpart (<ssepackmode>mode, op2)));
-
-      /* Shift input vectors right 32 bits so we can multiply high parts.  */
-      emit_insn (gen_lshr<mode>3 (t2, op1, thirtytwo));
-      emit_insn (gen_lshr<mode>3 (t3, op2, thirtytwo));
-
-      /* Multiply high parts by low parts.  */
-      emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
-		  (t4, gen_lowpart (<ssepackmode>mode, op1),
-		   gen_lowpart (<ssepackmode>mode, t3)));
-      emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
-		  (t5, gen_lowpart (<ssepackmode>mode, op2),
-		   gen_lowpart (<ssepackmode>mode, t2)));
-
-      /* Shift them back.  */
-      emit_insn (gen_ashl<mode>3 (t4, t4, thirtytwo));
-      emit_insn (gen_ashl<mode>3 (t5, t5, thirtytwo));
-
-      /* Add the three parts together.  */
-      emit_insn (gen_add<mode>3 (t6, t1, t4));
-      emit_insn (gen_add<mode>3 (op0, t6, t5));
-    }
-
-  set_unique_reg_note (get_last_insn (), REG_EQUAL,
-		       gen_rtx_MULT (<MODE>mode, operands[1], operands[2]));
+  ix86_expand_sse2_mulvxdi3 (operands[0], operands[1], operands[2]);
   DONE;
 })