diff mbox series

[v2,11/11] Native complex ops: Experimental support in x86 backend

Message ID 20230912100713.1074-12-snoiry@kalrayinc.com
State New
Headers show
Series Native complex operations | expand

Commit Message

Sylvain Noiry Sept. 12, 2023, 10:07 a.m. UTC
Summary:
Add experimental support for native complex operation handling in
the x86 backend. For now it only supports add, sub, mul, conj, neg, mov
in SCmode (complex float). Performance gains are still marginal on this
target because there are no particular instructions to speed up complex
operations, except some SIMD tricks.

gcc/ChangeLog:

	* config/i386/i386.cc (classify_argument): Align complex
	element to the whole size, not the size of the parts.
	(ix86_return_in_memory): Handle complex modes like a scalar
	with the same size.
	(ix86_class_max_nregs): Likewise.
	(ix86_hard_regno_nregs): Likewise.
	(function_value_ms_64): Add case for SCmode.
	(ix86_build_const_vector): Likewise.
	(ix86_build_signbit_mask): Likewise.
	(x86_gen_rtx_complex): New: Implement the gen_rtx_complex
	hook, use registers of complex modes to represent complex
	elements in rtl.
	(x86_read_complex_part): New: Implement the read_complex_part
	hook, handle registers of complex modes.
	(x86_write_complex_part): New: Implement the write_complex_part
	hook, handle registers of complex modes.
	* config/i386/i386.h: Add SCmode in several predicates.
	* config/i386/sse.md: Add pattern for some complex operations in
	SCmode. This includes movsc, addsc3, subsc3, negsc2, mulsc3,
	and conjsc2.
---
 gcc/config/i386/i386.cc | 296 +++++++++++++++++++++++++++++++++++++++-
 gcc/config/i386/i386.h  |  11 +-
 gcc/config/i386/sse.md  | 144 +++++++++++++++++++
 3 files changed, 440 insertions(+), 11 deletions(-)
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 477e6cecc38..77bf80b64b1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -2348,8 +2348,8 @@  classify_argument (machine_mode mode, const_tree type,
 	mode_alignment = 128;
       else if (mode == XCmode)
 	mode_alignment = 256;
-      if (COMPLEX_MODE_P (mode))
-	mode_alignment /= 2;
+      /*if (COMPLEX_MODE_P (mode))
+	mode_alignment /= 2;*/
       /* Misaligned fields are always returned in memory.  */
       if (bit_offset % mode_alignment)
 	return 0;
@@ -3023,6 +3023,7 @@  pass_in_reg:
     case E_V4BFmode:
     case E_V2SImode:
     case E_V2SFmode:
+    case E_SCmode:
     case E_V1TImode:
     case E_V1DImode:
       if (!type || !AGGREGATE_TYPE_P (type))
@@ -3273,6 +3274,7 @@  pass_in_reg:
     case E_V4BFmode:
     case E_V2SImode:
     case E_V2SFmode:
+    case E_SCmode:
     case E_V1TImode:
     case E_V1DImode:
       if (!type || !AGGREGATE_TYPE_P (type))
@@ -4187,8 +4189,8 @@  function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
 	      && !INTEGRAL_TYPE_P (valtype)
 	      && !VECTOR_FLOAT_TYPE_P (valtype))
 	    break;
-	  if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
-	      && !COMPLEX_MODE_P (mode))
+	  if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)))
+	     // && !COMPLEX_MODE_P (mode))
 	    regno = FIRST_SSE_REG;
 	  break;
 	case 8:
@@ -4295,7 +4297,7 @@  ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
 	       || INTEGRAL_TYPE_P (type)
 	       || VECTOR_FLOAT_TYPE_P (type))
 	      && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
-	      && !COMPLEX_MODE_P (mode)
+	      //&& !COMPLEX_MODE_P (mode)
 	      && (GET_MODE_SIZE (mode) == 16 || size == 16))
 	    return false;
 
@@ -15752,6 +15754,7 @@  ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
     case E_V8SFmode:
     case E_V4SFmode:
     case E_V2SFmode:
+    case E_SCmode:
     case E_V8DFmode:
     case E_V4DFmode:
     case E_V2DFmode:
@@ -15800,6 +15803,7 @@  ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
     case E_V8SFmode:
     case E_V4SFmode:
     case E_V2SFmode:
+    case E_SCmode:
     case E_V2SImode:
       vec_mode = mode;
       imode = SImode;
@@ -19894,7 +19898,8 @@  ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
   else
     {
       if (COMPLEX_MODE_P (mode))
-	return 2;
+	return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
+	//return 2;
       else
 	return 1;
     }
@@ -20230,7 +20235,8 @@  ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
       return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
     }
   if (COMPLEX_MODE_P (mode))
-    return 2;
+    return 1;
+    //return 2;
   /* Register pair for mask registers.  */
   if (mode == P2QImode || mode == P2HImode)
     return 2;
@@ -23757,6 +23763,273 @@  ix86_preferred_simd_mode (scalar_mode mode)
     }
 }
 
+static rtx
+x86_gen_rtx_complex (machine_mode mode, rtx real_part, rtx imag_part)
+{
+  machine_mode imode = GET_MODE_INNER (mode);
+
+  if ((real_part == imag_part) && (real_part == CONST0_RTX (imode)))
+    {
+      if (CONST_DOUBLE_P (real_part))
+       return const_double_from_real_value (dconst0, mode);
+      else if (CONST_INT_P (real_part))
+       return GEN_INT (0);
+      else
+       gcc_unreachable ();
+    }
+
+  bool saved_generating_concat_p = generating_concat_p;
+  generating_concat_p = false;
+  rtx complex_reg = gen_reg_rtx (mode);
+  generating_concat_p = saved_generating_concat_p;
+
+  if (real_part)
+    {
+      gcc_assert (imode == GET_MODE (real_part));
+      write_complex_part (complex_reg, real_part, REAL_P, false);
+    }
+
+  if (imag_part)
+    {
+      gcc_assert (imode == GET_MODE (imag_part));
+      write_complex_part (complex_reg, imag_part, IMAG_P, false);
+    }
+
+  return complex_reg;
+}
+
+static rtx
+x86_read_complex_part (rtx cplx, complex_part_t part)
+{
+  machine_mode cmode;
+  scalar_mode imode;
+  unsigned ibitsize;
+
+  if (GET_CODE (cplx) == CONCAT)
+    return XEXP (cplx, part);
+
+  cmode = GET_MODE (cplx);
+  imode = GET_MODE_INNER (cmode);
+  ibitsize = GET_MODE_BITSIZE (imode);
+
+  if (COMPLEX_MODE_P (cmode) && (part == BOTH_P))
+    return cplx;
+
+  /* For constants under 32 bits, vector constants are folded during expand,
+   * so we need to compensate for it, as cplx is an integer constant.
+   * In this case cmode and imode are equal.  */
+  if (cmode == imode)
+    ibitsize /= 2;
+
+  if (cmode == E_VOIDmode)
+    return cplx;               /* FIXME case used when initialising mock in a complex register */
+
+  if ((cmode == E_DCmode) && (GET_CODE (cplx) == CONST_DOUBLE))        /* FIXME stop generation of DC const_double, because not patterns and wired */
+    return CONST0_RTX (E_DFmode);
+  /* verify SC const_double as well */
+
+  /* Special case reads from complex constants that got spilled to memory.  */
+  if (MEM_P (cplx) && GET_CODE (XEXP (cplx, 0)) == SYMBOL_REF)
+    {
+      tree decl = SYMBOL_REF_DECL (XEXP (cplx, 0));
+      if (decl && TREE_CODE (decl) == COMPLEX_CST)
+	{
+	  tree cplx_part = (part == IMAG_P) ? TREE_IMAGPART (decl)
+			  : (part == REAL_P) ? TREE_REALPART (decl)
+			  : TREE_COMPLEX_BOTH_PARTS (decl);
+	if (CONSTANT_CLASS_P (cplx_part))
+	  return expand_expr (cplx_part, NULL_RTX, imode, EXPAND_NORMAL);
+	}
+    }
+
+  /* For MEMs simplify_gen_subreg may generate an invalid new address
+     because, e.g., the original address is considered mode-dependent
+     by the target, which restricts simplify_subreg from invoking
+     adjust_address_nv.  Instead of preparing fallback support for an
+     invalid address, we call adjust_address_nv directly.  */
+  if (MEM_P (cplx))
+    {
+      if (part == BOTH_P)
+       return adjust_address_nv (cplx, cmode, 0);
+      else
+       return adjust_address_nv (cplx, imode, (part == IMAG_P)
+				 ? GET_MODE_SIZE (imode) : 0);
+    }
+
+  /* If the sub-object is at least word sized, then we know that subregging
+     will work.  This special case is important, since extract_bit_field
+     wants to operate on integer modes, and there's rarely an OImode to
+     correspond to TCmode.  */
+  if (ibitsize >= BITS_PER_WORD
+      /* For hard regs we have exact predicates.  Assume we can split
+	 the original object if it spans an even number of hard regs.
+	 This special case is important for SCmode on 64-bit platforms
+	 where the natural size of floating-point regs is 32-bit.  */
+      || (REG_P (cplx)
+	  && REGNO (cplx) < FIRST_PSEUDO_REGISTER
+	  && REG_NREGS (cplx) % 2 == 0))
+    {
+      rtx ret = simplify_gen_subreg (imode, cplx, cmode, (part == IMAG_P)
+				     ? GET_MODE_SIZE (imode) : 0);
+      if (ret)
+       return ret;
+      else
+       /* simplify_gen_subreg may fail for sub-word MEMs.  */
+       gcc_assert (MEM_P (cplx) && ibitsize < BITS_PER_WORD);
+    }
+
+  if (part == BOTH_P)
+    return extract_bit_field (cplx, 2 * ibitsize, 0, true, NULL_RTX, cmode,
+			      cmode, false, NULL);
+  else
+    return extract_bit_field (cplx, ibitsize, (part == IMAG_P) ? ibitsize : 0,
+			      true, NULL_RTX, imode, imode, false, NULL);
+}
+
+static void
+x86_write_complex_part (rtx cplx, rtx val, complex_part_t part, bool undefined_p)
+{
+  machine_mode cmode;
+  scalar_mode imode;
+  unsigned ibitsize;
+
+  cmode = GET_MODE (cplx);
+  imode = GET_MODE_INNER (cmode);
+  ibitsize = GET_MODE_BITSIZE (imode);
+
+  /* special case for constants */
+  if (GET_CODE (val) == CONST_VECTOR)
+    {
+      if (part == BOTH_P)
+	{
+	  machine_mode temp_mode = E_BLKmode;;
+	  switch (cmode)
+	    {
+	    case E_CQImode:
+	      temp_mode = E_HImode;
+	      break;
+	    case E_CHImode:
+	      temp_mode = E_SImode;
+	      break;
+	    case E_CSImode:
+	      temp_mode = E_DImode;
+	      break;
+	    case E_SCmode:
+	      temp_mode = E_DFmode;
+	      break;
+	    case E_CDImode:
+	      temp_mode = E_TImode;
+	      break;
+	    case E_DCmode:
+	    default:
+	      break;
+	    }
+
+	  if (temp_mode != E_BLKmode)
+	    {
+	      rtx temp_reg = gen_reg_rtx (temp_mode);
+	      store_bit_field (temp_reg, GET_MODE_BITSIZE (temp_mode), 0, 0,
+			       0, GET_MODE (val), val, false, undefined_p);
+	      emit_move_insn (cplx,
+			      simplify_gen_subreg (cmode, temp_reg, temp_mode,
+						   0));
+	    }
+	  else
+	    {
+	      /* write real part and imag part separately */
+	      gcc_assert (GET_CODE (val) == CONST_VECTOR);
+	      write_complex_part (cplx, const_vector_elt (val, 0), REAL_P, false);
+	      write_complex_part (cplx, const_vector_elt (val, 1), IMAG_P, false);
+	    }
+	}
+      else
+	write_complex_part (cplx,
+			    const_vector_elt (val,
+			    ((part == REAL_P) ? 0 : 1)),
+			    part, false);
+      return;
+    }
+
+  if ((part == BOTH_P) && !MEM_P (cplx)
+      /*&& (optab_handler (mov_optab, cmode) != CODE_FOR_nothing)*/)
+    {
+      write_complex_part (cplx, read_complex_part(cplx, REAL_P), REAL_P, undefined_p);
+      write_complex_part (cplx, read_complex_part(cplx, IMAG_P), IMAG_P, undefined_p);
+      //emit_move_insn (cplx, val);
+      return;
+    }
+
+  if ((GET_CODE (val) == CONST_DOUBLE) || (GET_CODE (val) == CONST_INT))
+    {
+      if (part == REAL_P)
+	{
+	  emit_move_insn (gen_lowpart (imode, cplx), val);
+	  return;
+	}
+      else if (part == IMAG_P)
+	{
+	  /* cannot set highpart of a pseudo register */
+	  if (REGNO (cplx) < FIRST_PSEUDO_REGISTER)
+	    {
+	      emit_move_insn (gen_highpart (imode, cplx), val);
+	      return;
+	    }
+	}
+      else
+	gcc_unreachable ();
+    }
+
+  if (GET_CODE (cplx) == CONCAT)
+    {
+      emit_move_insn (XEXP (cplx, part), val);
+      return;
+    }
+
+  /* For MEMs simplify_gen_subreg may generate an invalid new address
+     because, e.g., the original address is considered mode-dependent
+     by the target, which restricts simplify_subreg from invoking
+     adjust_address_nv.  Instead of preparing fallback support for an
+     invalid address, we call adjust_address_nv directly.  */
+  if (MEM_P (cplx))
+    {
+      if (part == BOTH_P)
+       emit_move_insn (adjust_address_nv (cplx, cmode, 0), val);
+      else
+       emit_move_insn (adjust_address_nv (cplx, imode, (part == IMAG_P)
+					  ? GET_MODE_SIZE (imode) : 0), val);
+      return;
+    }
+
+  /* If the sub-object is at least word sized, then we know that subregging
+     will work.  This special case is important, since store_bit_field
+     wants to operate on integer modes, and there's rarely an OImode to
+     correspond to TCmode.  */
+  if (ibitsize >= BITS_PER_WORD
+      /* For hard regs we have exact predicates.  Assume we can split
+	 the original object if it spans an even number of hard regs.
+	 This special case is important for SCmode on 64-bit platforms
+	 where the natural size of floating-point regs is 32-bit.  */
+      || (REG_P (cplx)
+	  && REGNO (cplx) < FIRST_PSEUDO_REGISTER
+	  && REG_NREGS (cplx) % 2 == 0))
+    {
+      rtx cplx_part = simplify_gen_subreg (imode, cplx, cmode,
+					   (part == IMAG_P)
+					   ? GET_MODE_SIZE (imode) : 0);
+      if (cplx_part)
+	{
+	  emit_move_insn (cplx_part, val);
+	  return;
+	}
+      else
+       /* simplify_gen_subreg may fail for sub-word MEMs.  */
+       gcc_assert (MEM_P (cplx) && ibitsize < BITS_PER_WORD);
+    }
+
+  store_bit_field (cplx, ibitsize, (part == IMAG_P) ? ibitsize : 0, 0, 0,
+		   imode, val, false, undefined_p);
+}
+
 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
    vectors.  If AVX512F is enabled then try vectorizing with 512bit,
    256bit and 128bit vectors.  */
@@ -25792,6 +26065,15 @@  ix86_libgcc_floating_mode_supported_p
 #undef TARGET_IFUNC_REF_LOCAL_OK
 #define TARGET_IFUNC_REF_LOCAL_OK ix86_ifunc_ref_local_ok
 
+#undef TARGET_GEN_RTX_COMPLEX
+#define TARGET_GEN_RTX_COMPLEX x86_gen_rtx_complex
+
+#undef TARGET_READ_COMPLEX_PART
+#define TARGET_READ_COMPLEX_PART x86_read_complex_part
+
+#undef TARGET_WRITE_COMPLEX_PART
+#define TARGET_WRITE_COMPLEX_PART x86_write_complex_part
+
 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
 # undef TARGET_ASM_RELOC_RW_MASK
 # define TARGET_ASM_RELOC_RW_MASK ix86_reloc_rw_mask
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3e8488f2ae8..faa058f3ec0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1058,7 +1058,8 @@  extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode	\
    || (MODE) == V2DImode || (MODE) == V2QImode				\
    || (MODE) == DFmode	|| (MODE) == DImode				\
-   || (MODE) == HFmode || (MODE) == BFmode)
+   || (MODE) == HFmode || (MODE) == BFmode				\
+   || (MODE) == SCmode)
 
 #define VALID_SSE_REG_MODE(MODE)					\
   ((MODE) == V1TImode || (MODE) == TImode				\
@@ -1067,7 +1068,7 @@  extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == TFmode || (MODE) == TDmode)
 
 #define VALID_MMX_REG_MODE_3DNOW(MODE) \
-  ((MODE) == V2SFmode || (MODE) == SFmode)
+  ((MODE) == V2SFmode || (MODE) == SFmode || (MODE) == SCmode)
 
 /* To match ia32 psABI, V4HFmode should be added here.  */
 #define VALID_MMX_REG_MODE(MODE)					\
@@ -1110,13 +1111,15 @@  extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode	\
    || (MODE) == V16SFmode \
    || (MODE) == V32HFmode || (MODE) == V16HFmode || (MODE) == V8HFmode  \
-   || (MODE) == V32BFmode || (MODE) == V16BFmode || (MODE) == V8BFmode)
+   || (MODE) == V32BFmode || (MODE) == V16BFmode || (MODE) == V8BFmode	\
+   || (MODE) == SCmode)
 
 #define X87_FLOAT_MODE_P(MODE)	\
   (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
 
 #define SSE_FLOAT_MODE_P(MODE) \
-  ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode))
+  ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode) \
+   || (TARGET_SSE2 && (MODE) == SCmode))
 
 #define SSE_FLOAT_MODE_SSEMATH_OR_HF_P(MODE)				\
   ((SSE_FLOAT_MODE_P (MODE) && TARGET_SSE_MATH)				\
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 80b43fd7db7..06281eb0fd6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30504,3 +30504,147 @@ 
   "TARGET_AVXVNNIINT16"
   "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}"
    [(set_attr "prefix" "vex")])
+
+(define_expand "movsc"
+  [(match_operand:SC 0 "nonimmediate_operand" "")
+   (match_operand:SC 1 "nonimmediate_operand" "")]
+  ""
+  {
+    emit_insn (gen_movv2sf (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+			    simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "addsc3"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")
+   (match_operand:SC 2 "register_operand" "r")]
+  ""
+  {
+    emit_insn (gen_addv2sf3 (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+			     simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0),
+			     simplify_gen_subreg (V2SFmode, operands[2], SCmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "subsc3"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")
+   (match_operand:SC 2 "register_operand" "r")]
+  ""
+  {
+    emit_insn (gen_subv2sf3 (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+			     simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0),
+			     simplify_gen_subreg (V2SFmode, operands[2], SCmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "negsc2"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")]
+  ""
+  {
+    emit_insn (gen_negv2sf2 (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+                             simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "sse_shufsc"
+  [(match_operand:V4SF 0 "register_operand")
+   (match_operand:SC 1 "register_operand")
+   (match_operand:SC 2 "vector_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_SSE"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_sse_shufsc_sc (operands[0],
+						     operands[1],
+						     operands[2],
+						     GEN_INT ((mask >> 0) & 3),
+						     GEN_INT ((mask >> 2) & 3),
+						     GEN_INT (((mask >> 4) & 3) + 4),
+						     GEN_INT (((mask >> 6) & 3) + 4)));
+  DONE;
+})
+
+(define_insn "sse_shufsc_sc"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,v")
+	(vec_select:V4SF
+	  (vec_concat:V4SF
+	    (match_operand:V2SF 1 "register_operand" "0,v")
+	    (match_operand:V2SF 2 "vector_operand" "xBm,vm"))
+	  (parallel [(match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_4_to_7_operand")
+		     (match_operand 6 "const_4_to_7_operand")])))]
+  "TARGET_SSE"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[3]) << 0;
+  mask |= INTVAL (operands[4]) << 2;
+  mask |= (INTVAL (operands[5]) - 4) << 4;
+  mask |= (INTVAL (operands[6]) - 4) << 6;
+  operands[3] = GEN_INT (mask);
+
+  switch (which_alternative)
+    {
+    case 0:
+      return "shufps\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+      return "vshufps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseshuf")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set_attr "mode" "V4SF")])
+
+(define_expand "mulsc3"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")
+   (match_operand:SC 2 "register_operand" "r")]
+  "TARGET_SSE3"
+  {
+    rtx a = gen_reg_rtx (V4SFmode);
+    rtx b = gen_reg_rtx (V4SFmode);
+    emit_insn (gen_sse_shufsc (a,
+                                    simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0),
+                                    simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0),
+                                    GEN_INT (0b01000100)));
+    emit_insn (gen_sse_shufsc (b,
+                                    simplify_gen_subreg (V2SFmode, operands[2], SCmode, 0),
+                                    simplify_gen_subreg (V2SFmode, operands[2], SCmode, 0),
+                                    GEN_INT (0b00010100)));
+    emit_insn (gen_mulv4sf3 (a, a, b));
+    emit_insn (gen_sse_shufps (b,
+                                    a,
+                                    a,
+                                    GEN_INT (0b00001101)));
+    emit_insn (gen_sse_shufps (a,
+                                    a,
+                                    a,
+                                    GEN_INT (0b00001000)));
+    emit_insn (gen_vec_addsubv2sf3 (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+				    simplify_gen_subreg (V2SFmode, a, V4SFmode, 0),
+				    simplify_gen_subreg (V2SFmode, b, V4SFmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "conjsc2"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")]
+  ""
+  {
+    emit_insn (gen_negdf2 (simplify_gen_subreg (DFmode, operands[0], SCmode, 0),
+			   simplify_gen_subreg (DFmode, operands[1], SCmode, 0)));
+    DONE;
+  }
+)