Patchwork [3/4] i386: Add widen_multiply_even/odd builtins and hooks

login
register
mail settings
Submitter Richard Henderson
Date June 25, 2012, 8:39 p.m.
Message ID <1340656798-18033-4-git-send-email-rth@redhat.com>
Download mbox | patch
Permalink /patch/167237/
State New
Headers show

Comments

Richard Henderson - June 25, 2012, 8:39 p.m.
---
 gcc/ChangeLog                          |   19 ++++++
 gcc/config/i386/i386-builtin-types.def |    5 +-
 gcc/config/i386/i386.c                 |  103 +++++++++++++++++++++++++++++++-
 gcc/config/i386/sse.md                 |   14 +++++
 4 files changed, 137 insertions(+), 4 deletions(-)

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b95eab5..f63f523 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,24 @@ 
 2012-06-25  Richard Henderson  <rth@redhat.com>
 
+	* config/i386/i386-builtin-types.def (V4UDI, V8USI): New.
+	(V2UDI_FTYPE_V4USI_V4USI): New.
+	(V4UDI_FTYPE_V8USI_V8USI): New.
+	* config/i386/i386.c (ix86_expand_args_builtin): Handle them.
+	(IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI): New.
+	(IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI): New.
+	(IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI): New.
+	(IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI): New.
+	(IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI): New.
+	(IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI): New.
+	(bdesc_args): Add them.
+	(ix86_builtin_mul_widen_even, ix86_builtin_mul_widen_odd): New.
+	(TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN): New.
+	(TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD): New.
+	(ix86_expand_mul_widen_evenodd): Use xop_pmacsdqh.
+	* config/i386/sse.md (vec_widen_<s>mult_odd_<mode>): New.
+
+2012-06-25  Richard Henderson  <rth@redhat.com>
+
 	* config/i386.sse.md (mul<VI4_AVX2>3): Use xop_pmacsdd.
 
 2012-06-25  Richard Henderson  <rth@redhat.com>
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 401668a..398bf0a 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -97,7 +97,8 @@  DEF_VECTOR_TYPE (V4DI, DI)
 DEF_VECTOR_TYPE (V8SI, SI)
 DEF_VECTOR_TYPE (V16HI, HI)
 DEF_VECTOR_TYPE (V32QI, QI)
-
+DEF_VECTOR_TYPE (V4UDI, UDI, V4DI)
+DEF_VECTOR_TYPE (V8USI, USI, V8SI)
 
 DEF_POINTER_TYPE (PCCHAR, CHAR, CONST)
 DEF_POINTER_TYPE (PCDOUBLE, DOUBLE, CONST)
@@ -283,6 +284,7 @@  DEF_FUNCTION_TYPE (V2DI, V2DI, SI)
 DEF_FUNCTION_TYPE (V2DI, V2DI, V16QI)
 DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI)
 DEF_FUNCTION_TYPE (V2DI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (V2UDI, V4USI, V4USI)
 DEF_FUNCTION_TYPE (V2DI, PCV2DI, V2DI)
 DEF_FUNCTION_TYPE (V2SF, V2SF, V2SF)
 DEF_FUNCTION_TYPE (V2SI, INT, INT)
@@ -349,6 +351,7 @@  DEF_FUNCTION_TYPE (V8SI, V8SI, SI)
 DEF_FUNCTION_TYPE (V8SI, PCV8SI, V8SI)
 DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI)
 DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V4UDI, V8USI, V8USI)
 DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI)
 DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI)
 DEF_FUNCTION_TYPE (V4DI, V4DI, INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index a1b7628..c825033 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -25754,6 +25754,13 @@  enum ix86_builtins
   IX86_BUILTIN_CPYSGNPS256,
   IX86_BUILTIN_CPYSGNPD256,
 
+  IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI,
+  IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI,
+  IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI,
+  IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI,
+  IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI,
+  IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI,
+
   /* FMA4 instructions.  */
   IX86_BUILTIN_VFMADDSS,
   IX86_BUILTIN_VFMADDSD,
@@ -26612,6 +26619,8 @@  static const struct builtin_description bdesc_args[] =
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_vw_umul_even_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_odd_v4si, "__builtin_ia32_vw_umul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
 
@@ -26738,6 +26747,7 @@  static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
 
   /* SSE4.1 */
@@ -27004,12 +27014,15 @@  static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2  , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2  , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3  , "__builtin_ia32_pmuldq256"  , IX86_BUILTIN_PMULDQ256  , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_odd_v8si, "__builtin_ia32_vw_smul_odd_v8si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256"  , IX86_BUILTIN_PMULHW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256"  , IX86_BUILTIN_PMULLW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256"  , IX86_BUILTIN_PMULLD256  , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3  , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3  , "__builtin_ia32_vw_umul_even_v8si" , IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI, UNKNOWN, (int) V4UDI_FTYPE_V8USI_V8USI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_odd_v8si, "__builtin_ia32_vw_umul_odd_v8si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI, UNKNOWN, (int) V4UDI_FTYPE_V8USI_V8USI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
@@ -29142,6 +29155,7 @@  ix86_expand_args_builtin (const struct builtin_description *d,
     case V2DI_FTYPE_V2DI_V2DI:
     case V2DI_FTYPE_V16QI_V16QI:
     case V2DI_FTYPE_V4SI_V4SI:
+    case V2UDI_FTYPE_V4USI_V4USI:
     case V2DI_FTYPE_V2DI_V16QI:
     case V2DI_FTYPE_V2DF_V2DF:
     case V2SI_FTYPE_V2SI_V2SI:
@@ -29166,6 +29180,7 @@  ix86_expand_args_builtin (const struct builtin_description *d,
     case V8SI_FTYPE_V16HI_V16HI:
     case V4DI_FTYPE_V4DI_V4DI:
     case V4DI_FTYPE_V8SI_V8SI:
+    case V4UDI_FTYPE_V8USI_V8USI:
       if (comparison == UNKNOWN)
 	return ix86_expand_binop_builtin (icode, exp, target);
       nargs = 2;
@@ -31042,6 +31057,90 @@  ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
 	return NULL_TREE;
       }
 }
+
+/* Implement TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN.  Return the
+   ix86_builtins entry that widening-multiplies the even elements of
+   two vectors of type TYPE, or NULL_TREE when the vector mode is not
+   handled or the ISA needed by the chosen builtin is not enabled.  */
+
+static tree
+ix86_builtin_mul_widen_even (tree type)
+{
+  bool uns_p = TYPE_UNSIGNED (type);
+  enum ix86_builtins code;
+
+  switch (TYPE_MODE (type))
+    {
+    case V4SImode:
+      if (uns_p)
+	{
+	  if (!TARGET_SSE2)
+	    return NULL_TREE;
+	  code = IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI;
+	}
+      else
+	{
+	  /* Signed even multiply reuses the SSE4.1 pmuldq builtin.  */
+	  if (!TARGET_SSE4_1)
+	    return NULL_TREE;
+	  code = IX86_BUILTIN_PMULDQ128;
+	}
+      break;
+
+    case V8SImode:
+      if (!TARGET_AVX2)
+	return NULL_TREE;
+      code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI
+	      : IX86_BUILTIN_PMULDQ256);
+      break;
+
+    default:
+      return NULL_TREE;
+    }
+  return ix86_builtins[code];
+}
+
+/* Implement TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD.  Return the
+   ix86_builtins entry that widening-multiplies the odd elements of
+   two vectors of type TYPE, or NULL_TREE when the vector mode is not
+   handled or the ISA needed by the chosen builtin is not enabled.  */
+
+static tree
+ix86_builtin_mul_widen_odd (tree type)
+{
+  bool uns_p = TYPE_UNSIGNED (type);
+  enum ix86_builtins code;
+
+  switch (TYPE_MODE (type))
+    {
+    case V4SImode:
+      if (uns_p)
+	{
+	  if (!TARGET_SSE2)
+	    return NULL_TREE;
+	  code = IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI;
+	}
+      else
+	{
+	  /* The signed odd builtin is registered under SSE4.1.  */
+	  if (!TARGET_SSE4_1)
+	    return NULL_TREE;
+	  code = IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI;
+	}
+      break;
+
+    case V8SImode:
+      if (!TARGET_AVX2)
+	return NULL_TREE;
+      code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI
+	      : IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI);
+      break;
+
+    default:
+      return NULL_TREE;
+    }
+  return ix86_builtins[code];
+}
 
 /* Helper for avx_vpermilps256_operand et al.  This is also used by
    the expansion functions to turn the parallel back into a mask.
@@ -38663,6 +38750,7 @@  ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
 			       bool uns_p, bool odd_p)
 {
   enum machine_mode mode = GET_MODE (op1);
+  enum machine_mode wmode = GET_MODE (dest);
   rtx x;
 
   /* We only play even/odd games with vectors of SImode.  */
@@ -38672,8 +38760,12 @@  ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
      the even slots.  For some cpus this is faster than a PSHUFD.  */
   if (odd_p)
     {
-      enum machine_mode wmode = GET_MODE (dest);
-
+      if (TARGET_XOP && mode == V4SImode && !uns_p) /* vpmacsdqh is signed.  */
+	{
+	  x = force_reg (wmode, CONST0_RTX (wmode));
+	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
+	  return;
+	}
       op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
 			  GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
 			  1, OPTAB_DIRECT);
@@ -38697,7 +38789,7 @@  ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
     x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
   else if (TARGET_XOP)
     {
-      x = force_reg (V2DImode, CONST0_RTX (V2DImode));
+      x = force_reg (wmode, CONST0_RTX (wmode));
       x = gen_xop_pmacsdql (dest, op1, op2, x);
     }
   else
@@ -39980,6 +40072,11 @@  ix86_memmodel_check (unsigned HOST_WIDE_INT val)
 #undef TARGET_VECTORIZE_BUILTIN_GATHER
 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
 
+#undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN
+#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN ix86_builtin_mul_widen_even
+#undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD
+#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD ix86_builtin_mul_widen_odd
+
 #undef TARGET_BUILTIN_RECIPROCAL
 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 93cd9d7..45d3a9c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5708,6 +5708,20 @@ 
   DONE;
 })
 
+(define_expand "vec_widen_<s>mult_odd_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI124_AVX2 1 "register_operand"))
+   (match_operand:VI124_AVX2 2 "register_operand")]
+  ; Note that SSE2 does not have signed SI multiply
+  "TARGET_AVX || TARGET_XOP || TARGET_SSE4_1
+   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+{
+  ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
+				 <u_bool>, true);
+  DONE;
+})
+
 (define_expand "sdot_prod<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
    (match_operand:VI2_AVX2 1 "register_operand")