[3/3] PR target/53749

Message ID 1340470867-24070-4-git-send-email-rth@redhat.com
State New

Commit Message

Richard Henderson June 23, 2012, 5:01 p.m. UTC
	* config/i386/i386.c (ix86_rtx_costs): Add reasonable costs for
	V*QImode shifts and multiply.
	(ix86_expand_vecop_qihi): Support shifts.
	* config/i386/i386.md (any_shift): New code iterator.
	* config/i386/sse.md (ashlv16qi3): Merge ...
	(<shift_insn>v16qi3): ... into ...
	(<shift_insn><mode>3): ... here.  Use ix86_expand_vecop_qihi
	to support SSE and AVX.
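
SSE has no byte-granularity shift or multiply instructions, so the new
expander path emulates V16QImode/V32QImode operations by widening each
byte into a 16-bit lane, doing the operation there, and packing the low
byte of every lane back together.  A minimal scalar sketch of the idea
(illustration only, not code from the patch):

  #include <stdint.h>
  #include <stdio.h>

  /* Byte-wise left shift via the wide lanes; the high byte of each
     lane is garbage but is discarded by the final truncation, which
     is why the unpack may simply duplicate the low byte.  */
  static void
  shl_v16qi_model (uint8_t dst[16], const uint8_t src[16], int count)
  {
    for (int i = 0; i < 16; ++i)
      {
	uint16_t lane = src[i];
	dst[i] = (uint8_t) (lane << count);
      }
  }

  /* Arithmetic right shift instead needs a sign-extending unpack
     (uns_p == false in ix86_expand_vecop_qihi).  */
  static void
  ashr_v16qi_model (uint8_t dst[16], const uint8_t src[16], int count)
  {
    for (int i = 0; i < 16; ++i)
      {
	int16_t lane = (int8_t) src[i];
	dst[i] = (uint8_t) (lane >> count);
      }
  }

  int
  main (void)
  {
    uint8_t v[16], l[16], r[16];
    for (int i = 0; i < 16; ++i)
      v[i] = (uint8_t) (i * 17);
    shl_v16qi_model (l, v, 3);
    ashr_v16qi_model (r, v, 3);
    for (int i = 0; i < 16; ++i)
      printf ("%02x/%02x%c", l[i], r[i], i == 15 ? '\n' : ' ');
    return 0;
  }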

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7ae2060..fc30632 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31938,9 +31938,10 @@  ix86_set_reg_reg_cost (enum machine_mode mode)
    scanned.  In either case, *TOTAL contains the cost result.  */
 
 static bool
-ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
+ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 		bool speed)
 {
+  enum rtx_code code = (enum rtx_code) code_i;
   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
   enum machine_mode mode = GET_MODE (x);
   const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
@@ -32045,7 +32046,31 @@  ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
 	  /* ??? Should be SSE vector operation cost.  */
 	  /* At least for published AMD latencies, this really is the same
 	     as the latency for a simple fpu operation like fabs.  */
-	  *total = cost->fabs;
+	  /* V*QImode is emulated with 1-11 insns.  */
+	  if (mode == V16QImode || mode == V32QImode)
+	    {
+	      int count;
+	      if (TARGET_XOP && mode == V16QImode)
+		{
+		  /* For XOP we use vpshab, which requires a broadcast of the
+		     value to the variable shift insn.  For constants this
+		     means a V16Q const in mem; even when we can perform the
+		     shift with one insn set the cost to prefer paddb.  */
+		  if (CONSTANT_P (XEXP (x, 1)))
+		    {
+		      *total = (cost->fabs
+				+ rtx_cost (XEXP (x, 0), code, 0, speed)
+				+ (speed ? 2 : COSTS_N_BYTES (16)));
+		      return true;
+		    }
+		  count = 3;
+		}
+	      else
+		count = TARGET_SSSE3 ? 7 : 11;
+	      *total = cost->fabs * count;
+	    }
+	  else
+	    *total = cost->fabs;
 	  return false;
 	}
       if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
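
The numbers in this hunk are worth unpacking: a variable XOP shift is
modeled as 3 fabs-equivalents, the SSSE3 and plain-SSE2 emulations as 7
and 11.  For a constant count under XOP the broadcast becomes a
V16QImode constant-pool load, so the total is cost->fabs plus the cost
of the shifted operand plus 2 (or COSTS_N_BYTES (16), the constant
itself, when optimizing for size); since a plain paddb is a single
cost->fabs, a shift left by 1 is deliberately costed above the
equivalent paddb x,x.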
@@ -32119,9 +32144,15 @@  ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
 	}
       else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	{
+	  /* V*QImode is emulated with 7-13 insns.  */
+	  if (mode == V16QImode || mode == V32QImode)
+	    {
+	      int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
+	      *total = cost->fmul * 2 + cost->fabs * extra;
+	    }
 	  /* Without sse4.1, we don't have PMULLD; it's emulated with 7
 	     insns, including two PMULUDQ.  */
-	  if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+	  else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
 	    *total = cost->fmul * 2 + cost->fabs * 5;
 	  else
 	    *total = cost->fmul;
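
The multiply costs follow the same pattern as the existing V4SImode
PMULLD case: the byte multiply is two V8HImode multiplies (pmullw) on
the interleaved halves plus support instructions, giving 2*fmul +
11*fabs on bare SSE2, 2*fmul + 6*fabs with SSSE3, and 2*fmul + 5*fabs
with XOP, which is where the "7-13 insns" in the new comment comes
from.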
@@ -38448,44 +38479,66 @@  ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   rtx (*gen_ih) (rtx, rtx, rtx);
   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
   struct expand_vec_perm_d d;
-  bool ok;
+  bool ok, full_interleave;
+  bool uns_p = false;
   int i;
 
-  if (qimode == V16QImode)
+  switch (qimode)
     {
+    case V16QImode:
       himode = V8HImode;
       gen_il = gen_vec_interleave_lowv16qi;
       gen_ih = gen_vec_interleave_highv16qi;
-    }
-  else if (qimode == V32QImode)
-    {
+      break;
+    case V32QImode:
       himode = V16HImode;
       gen_il = gen_avx2_interleave_lowv32qi;
       gen_ih = gen_avx2_interleave_highv32qi;
+      break;
+    default:
+      gcc_unreachable ();
     }
-  else
-    gcc_unreachable ();
 
-  /* Unpack data such that we've got a source byte in each low byte of
-     each word.  We don't care what goes into the high byte of each word.
-     Rather than trying to get zero in there, most convenient is to let
-     it be a copy of the low byte.  */
-  op1_l = gen_reg_rtx (qimode);
-  op1_h = gen_reg_rtx (qimode);
-  emit_insn (gen_il (op1_l, op1, op1));
-  emit_insn (gen_ih (op1_h, op1, op1));
+  op2_l = op2_h = op2;
+  switch (code)
+    {
+    case MULT:
+      /* Unpack data such that we've got a source byte in each low byte of
+	 each word.  We don't care what goes into the high byte of each word.
+	 Rather than trying to get zero in there, most convenient is to let
+	 it be a copy of the low byte.  */
+      op2_l = gen_reg_rtx (qimode);
+      op2_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op2_l, op2, op2));
+      emit_insn (gen_ih (op2_h, op2, op2));
+      /* FALLTHRU */
 
-  op2_l = gen_reg_rtx (qimode);
-  op2_h = gen_reg_rtx (qimode);
-  emit_insn (gen_il (op2_l, op2, op2));
-  emit_insn (gen_ih (op2_h, op2, op2));
+      op1_l = gen_reg_rtx (qimode);
+      op1_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op1_l, op1, op1));
+      emit_insn (gen_ih (op1_h, op1, op1));
+      full_interleave = qimode == V16QImode;
+      break;
+
+    case ASHIFT:
+    case LSHIFTRT:
+      uns_p = true;
+      /* FALLTHRU */
+    case ASHIFTRT:
+      op1_l = gen_reg_rtx (himode);
+      op1_h = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+      full_interleave = true;
+      break;
+    default:
+      gcc_unreachable ();
+    }
 
   /* Perform the operation.  */
-  res_l = expand_simple_binop (himode, code, gen_lowpart (himode, op1_l),
-			       gen_lowpart (himode, op2_l), NULL_RTX,
+  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
 			       1, OPTAB_DIRECT);
-  res_h = expand_simple_binop (himode, code, gen_lowpart (himode, op1_h),
-			       gen_lowpart (himode, op2_h), NULL_RTX,
+  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
 			       1, OPTAB_DIRECT);
   gcc_assert (res_l && res_h);
 
@@ -38498,11 +38551,11 @@  ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   d.one_operand_p = false;
   d.testing_p = false;
 
-  if (qimode == V16QImode)
+  if (full_interleave)
     {
       /* For SSE2, we used a full interleave, so the desired
 	 results are in the even elements.  */
-      for (i = 0; i < 16; ++i)
+      for (i = 0; i < 32; ++i)
 	d.perm[i] = i * 2;
     }
   else
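
After the wide operation, each result half carries its useful byte in
the low half of a 16-bit lane, and the trailing vec_perm selects those
even bytes (d.perm[i] = i * 2 across the concatenation of res_l and
res_h).  A hypothetical scalar model of that final permute for the
V16QImode full-interleave case:

  #include <stdint.h>
  #include <stdio.h>

  /* Byte i of the destination is byte 2*i of the 32-byte
     concatenation {res_l, res_h}, i.e. the low (little-endian) byte
     of each 16-bit lane.  */
  static void
  pack_even_bytes (uint8_t dst[16], const uint16_t res_l[8],
		   const uint16_t res_h[8])
  {
    for (int i = 0; i < 8; ++i)
      {
	dst[i] = (uint8_t) res_l[i];	  /* perm indices 0, 2, ..., 14 */
	dst[i + 8] = (uint8_t) res_h[i];  /* perm indices 16, ..., 30 */
      }
  }

  int
  main (void)
  {
    uint16_t lo[8], hi[8];
    uint8_t out[16];
    for (int i = 0; i < 8; ++i)
      {
	lo[i] = 0xff00 | i;	/* useful data in the low byte only */
	hi[i] = 0xff00 | (i + 8);
      }
    pack_even_bytes (out, lo, hi);
    for (int i = 0; i < 16; ++i)
      printf ("%d%c", out[i], i == 15 ? '\n' : ' ');
    return 0;
  }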
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 879b87b..da2f4b2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -711,6 +711,9 @@ 
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])
 
+;; Mapping of all shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Base name for define_insn
 (define_code_attr shift_insn
   [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
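
Together with the existing shift_insn code attribute shown above, the
new any_shift iterator lets a single pattern named <shift_insn><mode>3
expand to ashl<mode>3, lshr<mode>3 and ashr<mode>3, with <CODE>
distinguishing the three instances inside the expander body; that is
what the merged sse.md expander below relies on.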
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c7c6392..691256d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10550,60 +10550,42 @@ 
    (set_attr "prefix_extra" "2")
    (set_attr "mode" "TI")])
 
-;; SSE2 doesn't have some shift variants, so define versions for XOP
-(define_expand "ashlv16qi3"
-  [(set (match_operand:V16QI 0 "register_operand")
-	(ashift:V16QI
-	  (match_operand:V16QI 1 "register_operand")
-	  (match_operand:SI 2 "nonmemory_operand")))]
-  "TARGET_XOP"
-{
-  rtx reg = gen_reg_rtx (V16QImode);
-  rtx par;
-  int i;
-
-  par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
-  for (i = 0; i < 16; i++)
-    XVECEXP (par, 0, i) = operands[2];
-
-  emit_insn (gen_vec_initv16qi (reg, par));
-  emit_insn (gen_xop_shav16qi3 (operands[0], operands[1], reg));
-  DONE;
-})
-
-(define_expand "<shift_insn>v16qi3"
-  [(set (match_operand:V16QI 0 "register_operand")
-	(any_shiftrt:V16QI
-	  (match_operand:V16QI 1 "register_operand")
+(define_expand "<shift_insn><mode>3"
+  [(set (match_operand:VI1_AVX2 0 "register_operand")
+	(any_shift:VI1_AVX2
+	  (match_operand:VI1_AVX2 1 "register_operand")
 	  (match_operand:SI 2 "nonmemory_operand")))]
-  "TARGET_XOP"
+  "TARGET_SSE2"
 {
-  rtx reg = gen_reg_rtx (V16QImode);
-  rtx par;
-  bool negate = false;
-  rtx (*shift_insn)(rtx, rtx, rtx);
-  int i;
-
-  if (CONST_INT_P (operands[2]))
-    operands[2] = GEN_INT (-INTVAL (operands[2]));
-  else
-    negate = true;
+  if (TARGET_XOP && <MODE>mode == V16QImode)
+    {
+      bool negate = false;
+      rtx (*gen) (rtx, rtx, rtx);
+      rtx tmp, par;
+      int i;
 
-  par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
-  for (i = 0; i < 16; i++)
-    XVECEXP (par, 0, i) = operands[2];
+      if (<CODE> != ASHIFT)
+	{
+	  if (CONST_INT_P (operands[2]))
+	    operands[2] = GEN_INT (-INTVAL (operands[2]));
+	  else
+	    negate = true;
+	}
+      par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
+      for (i = 0; i < 16; i++)
+        XVECEXP (par, 0, i) = operands[2];
 
-  emit_insn (gen_vec_initv16qi (reg, par));
+      tmp = gen_reg_rtx (V16QImode);
+      emit_insn (gen_vec_initv16qi (tmp, par));
 
-  if (negate)
-    emit_insn (gen_negv16qi2 (reg, reg));
+      if (negate)
+	emit_insn (gen_negv16qi2 (tmp, tmp));
 
-  if (<CODE> == LSHIFTRT)
-    shift_insn = gen_xop_shlv16qi3;
+      gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
+      emit_insn (gen (operands[0], operands[1], tmp));
+    }
   else
-    shift_insn = gen_xop_shav16qi3;
-
-  emit_insn (shift_insn (operands[0], operands[1], reg));
+    ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
   DONE;
 })
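
For reference on the retained XOP path: vpshlb and vpshab take a
per-byte signed shift count, where a positive count shifts left and a
negative count shifts right (logically for vpshlb, arithmetically for
vpshab).  That is why the expander negates operands[2] for everything
but ASHIFT.  A hypothetical one-byte model of those semantics:

  #include <stdint.h>
  #include <stdio.h>

  /* vpshlb: per-byte logical shift, negative count shifts right.  */
  static uint8_t
  xop_shlb (uint8_t b, int8_t count)
  {
    return count >= 0 ? (uint8_t) (b << count) : (uint8_t) (b >> -count);
  }

  /* vpshab: per-byte arithmetic shift, negative count shifts right.  */
  static uint8_t
  xop_shab (uint8_t b, int8_t count)
  {
    return count >= 0 ? (uint8_t) (b << count)
		      : (uint8_t) ((int8_t) b >> -count);
  }

  int
  main (void)
  {
    /* LSHIFTRT by 3 becomes vpshlb with count -3 (prints 1e);
       ASHIFTRT by 3 becomes vpshab with count -3 (prints fe).  */
    printf ("%02x %02x\n", xop_shlb (0xf0, -3), xop_shab (0xf0, -3));
    return 0;
  }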