diff mbox

[i386] ; Fix PR 66560, Fails to generate ADDSUBPS

Message ID CAFULd4ZEoZeyHEYAh_iLLvr8jU+PzXOm_mfFarr6vGCi9x1sEg@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak June 23, 2015, 8:59 a.m. UTC
Hello!

Attached patch introduces combiner splitters to handle every possible
ADDSUB permutation of vec_merge and vec_select/vec_concat operands.
These combiners handle swapped PLUS and MINUS operators, and account
for commutative operands of PLUS RTX. As shown in the attached
testcases, there are quite a few ways to create ADDSUB.

2015-06-23  Uros Bizjak  <ubizjak@gmail.com>

    PR target/66560
    * config/i386/predicates.md (addsub_vm_operator): New predicate.
    (addsub_vs_operator): Ditto.
    (addsub_vs_parallel): Ditto.
    * config/i386/sse.md (ssedoublemode): Add V4SF and V2DF modes.
    (avx_addsubv4df3, avx_addsubv8sf3, sse3_addsubv2df3, sse3_addsubv4sf3):
    Put minus RTX before plus and adjust vec_merge selector.
    (*avx_addsubv4df3_1, *avx_addsubv4df3_1s, *sse3_addsubv2df3_1)
    (*sse3_addsubv2df3_1s, *avx_addsubv8sf3_1, *avx_addsubv8sf3_1s)
    (*sse3_addsubv4sf3_1, *sse3_addsubv4sf3_1s): Remove insn patterns.
    (addsub vec_merge splitters): New combiner splitters.
    (addsub vec_select/vec_concat splitters): Ditto.

testsuite/ChangeLog:

2015-06-23  Uros Bizjak  <ubizjak@gmail.com>

    PR target/66560
    * gcc.target/i386/pr66560-1.c: New test.
    * gcc.target/i386/pr66560-2.c: Ditto.
    * gcc.target/i386/pr66560-3.c: Ditto.
    * gcc.target/i386/pr66560-4.c: Ditto.

Patch was tested on x86_64-linux-gnu {,-m32} and was committed to mainline SVN.

Uros.
diff mbox

Patch

Index: testsuite/gcc.target/i386/pr66560-1.c
===================================================================
--- testsuite/gcc.target/i386/pr66560-1.c	(revision 0)
+++ testsuite/gcc.target/i386/pr66560-1.c	(revision 0)
@@ -0,0 +1,35 @@ 
+/* PR target/66560 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4" } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+v4sf foo1 (v4sf x, v4sf y)
+{
+  v4sf tem0 = x - y;
+  v4sf tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 0, 5, 2, 7 });
+}
+
+v4sf foo2 (v4sf x, v4sf y)
+{
+  v4sf tem0 = x - y;
+  v4sf tem1 = y + x;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 0, 5, 2, 7 });
+}
+
+v4sf foo3 (v4sf x, v4sf y)
+{
+  v4sf tem0 = x + y;
+  v4sf tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 4, 1, 6, 3 });
+}
+
+v4sf foo4 (v4sf x, v4sf y)
+{
+  v4sf tem0 = y + x;
+  v4sf tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 4, 1, 6, 3 });
+}
+
+/* { dg-final { scan-assembler-times "addsubps" 4 } } */
Index: testsuite/gcc.target/i386/pr66560-2.c
===================================================================
--- testsuite/gcc.target/i386/pr66560-2.c	(revision 0)
+++ testsuite/gcc.target/i386/pr66560-2.c	(revision 0)
@@ -0,0 +1,35 @@ 
+/* PR target/66560 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4" } */
+
+typedef double v2df __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+v2df foo1 (v2df x, v2df y)
+{
+  v2df tem0 = x - y;
+  v2df tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 0, 3 });
+}
+
+v2df foo2 (v2df x, v2df y)
+{
+  v2df tem0 = x - y;
+  v2df tem1 = y + x;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 0, 3 });
+}
+
+v2df foo3 (v2df x, v2df y)
+{
+  v2df tem0 = x + y;
+  v2df tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 2, 1 });
+}
+
+v2df foo4 (v2df x, v2df y)
+{
+  v2df tem0 = y + x;
+  v2df tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 2, 1 });
+}
+
+/* { dg-final { scan-assembler-times "addsubpd" 4 } } */
Index: testsuite/gcc.target/i386/pr66560-3.c
===================================================================
--- testsuite/gcc.target/i386/pr66560-3.c	(revision 0)
+++ testsuite/gcc.target/i386/pr66560-3.c	(revision 0)
@@ -0,0 +1,35 @@ 
+/* PR target/66560 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef float v8sf __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+v8sf foo1 (v8sf x, v8sf y)
+{
+  v8sf tem0 = x - y;
+  v8sf tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 0, 9, 2, 11, 4, 13, 6, 15 });
+}
+
+v8sf foo2 (v8sf x, v8sf y)
+{
+  v8sf tem0 = x - y;
+  v8sf tem1 = y + x;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 0, 9, 2, 11, 4, 13, 6, 15 });
+}
+
+v8sf foo3 (v8sf x, v8sf y)
+{
+  v8sf tem0 = x + y;
+  v8sf tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 8, 1, 10, 3, 12, 5, 14, 7 });
+}
+
+v8sf foo4 (v8sf x, v8sf y)
+{
+  v8sf tem0 = y + x;
+  v8sf tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 8, 1, 10, 3, 12, 5, 14, 7 });
+}
+
+/* { dg-final { scan-assembler-times "vaddsubps" 4 } } */
Index: testsuite/gcc.target/i386/pr66560-4.c
===================================================================
--- testsuite/gcc.target/i386/pr66560-4.c	(revision 0)
+++ testsuite/gcc.target/i386/pr66560-4.c	(revision 0)
@@ -0,0 +1,35 @@ 
+/* PR target/66560 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef double v4df __attribute__((vector_size(32)));
+typedef long long v4di __attribute__((vector_size(32)));
+v4df foo1 (v4df x, v4df y)
+{
+  v4df tem0 = x - y;
+  v4df tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 0, 5, 2, 7 });
+}
+
+v4df foo2 (v4df x, v4df y)
+{
+  v4df tem0 = x - y;
+  v4df tem1 = y + x;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 0, 5, 2, 7 });
+}
+
+v4df foo3 (v4df x, v4df y)
+{
+  v4df tem0 = x + y;
+  v4df tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 4, 1, 6, 3 });
+}
+
+v4df foo4 (v4df x, v4df y)
+{
+  v4df tem0 = y + x;
+  v4df tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 4, 1, 6, 3 });
+}
+
+/* { dg-final { scan-assembler-times "vaddsubpd" 4 } } */
Index: config/i386/predicates.md
===================================================================
--- config/i386/predicates.md	(revision 224769)
+++ config/i386/predicates.md	(working copy)
@@ -1426,8 +1426,105 @@ 
   (and (match_code "unspec_volatile")
        (match_test "XINT (op, 1) == UNSPECV_VZEROUPPER")))
 
+;; Return true if OP is an addsub vec_merge operation
+(define_predicate "addsub_vm_operator"
+  (match_code "vec_merge")
+{
+  rtx op0, op1;
+  int swapped;
+  HOST_WIDE_INT mask;
+  int nunits, elt;
+
+  op0 = XEXP (op, 0);
+  op1 = XEXP (op, 1);
+
+  /* Sanity check.  */
+  if (GET_CODE (op0) == MINUS && GET_CODE (op1) == PLUS)
+    swapped = 0;
+  else if (GET_CODE (op0) == PLUS && GET_CODE (op1) == MINUS)
+    swapped = 1;
+  else
+    gcc_unreachable ();
+
+  mask = INTVAL (XEXP (op, 2));
+  nunits = GET_MODE_NUNITS (mode);
+
+  for (elt = 0; elt < nunits; elt++)
+    {
+      /* bit clear: take from op0, set: take from op1  */
+      int bit = !(mask & (HOST_WIDE_INT_1U << elt));
+
+      if (bit != ((elt & 1) ^ swapped))
+	return false;
+    }
+
+  return true;
+})
+
+;; Return true if OP is an addsub vec_select/vec_concat operation
+(define_predicate "addsub_vs_operator"
+  (and (match_code "vec_select")
+       (match_code "vec_concat" "0"))
+{
+  rtx op0, op1;
+  bool swapped;
+  int nunits, elt;
+
+  op0 = XEXP (XEXP (op, 0), 0);
+  op1 = XEXP (XEXP (op, 0), 1);
+
+  /* Sanity check.  */
+  if (GET_CODE (op0) == MINUS && GET_CODE (op1) == PLUS)
+    swapped = false;
+  else if (GET_CODE (op0) == PLUS && GET_CODE (op1) == MINUS)
+    swapped = true;
+  else
+    gcc_unreachable ();
+
+  nunits = GET_MODE_NUNITS (mode);
+  if (XVECLEN (XEXP (op, 1), 0) != nunits)
+    return false;
+
+  /* We already checked that permutation is suitable for addsub,
+     so only look at the first element of the parallel.  */
+  elt = INTVAL (XVECEXP (XEXP (op, 1), 0, 0));
+
+  return elt == (swapped ? nunits : 0);
+})
+
+;; Return true if OP is a parallel for an addsub vec_select.
+(define_predicate "addsub_vs_parallel"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  int nelt = XVECLEN (op, 0);
+  int elt, i;
+  
+  if (nelt < 2)
+    return false;
+
+  /* Check that the permutation is suitable for addsub.
+     For example, { 0 9 2 11 4 13 6 15 } or { 8 1 10 3 12 5 14 7 }.  */
+  elt = INTVAL (XVECEXP (op, 0, 0));
+  if (elt == 0)
+    {
+      for (i = 1; i < nelt; ++i)
+	if (INTVAL (XVECEXP (op, 0, i)) != (i + (i & 1) * nelt))
+	  return false;
+    }
+  else if (elt == nelt)
+    {
+      for (i = 1; i < nelt; ++i)
+	if (INTVAL (XVECEXP (op, 0, i)) != (elt + i - (i & 1) * nelt))
+	  return false;
+    }
+  else
+    return false;
+
+  return true;
+})
+
 ;; Return true if OP is a parallel for a vbroadcast permute.
-
 (define_predicate "avx_vbroadcast_operand"
   (and (match_code "parallel")
        (match_code "const_int" "a"))
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 224769)
+++ config/i386/sse.md	(working copy)
@@ -487,10 +487,12 @@ 
    (V4SI "v4di")   (V8SI "v8di")   (V16SI "v16di")])
 
 (define_mode_attr ssedoublemode
-  [(V16SF "V32SF") (V16SI "V32SI") (V8DI "V16DI") (V8DF "V16DF")
-   (V8SF "V16SF") (V8SI "V16SI") (V4DI "V8DI") (V4DF "V8DF")
-   (V16HI "V16SI") (V8HI "V8SI") (V4HI "V4SI") (V4SI "V4DI")
-   (V32HI "V32SI") (V32QI "V32HI") (V16QI "V16HI") (V64QI "V64HI")])
+  [(V4SF "V8SF") (V8SF "V16SF") (V16SF "V32SF")
+   (V2DF "V4DF") (V4DF "V8DF") (V8DF "V16DF")
+   (V16QI "V16HI") (V32QI "V32HI") (V64QI "V64HI")
+   (V4HI "V4SI") (V8HI "V8SI") (V16HI "V16SI") (V32HI "V32SI")
+   (V4SI "V4DI") (V8SI "V16SI") (V16SI "V32SI")
+   (V4DI "V8DI") (V8DI "V16DI")])
 
 (define_mode_attr ssebytemode
   [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")])
@@ -2021,57 +2023,25 @@ 
 (define_insn "avx_addsubv4df3"
   [(set (match_operand:V4DF 0 "register_operand" "=x")
 	(vec_merge:V4DF
-	  (plus:V4DF
+	  (minus:V4DF
 	    (match_operand:V4DF 1 "register_operand" "x")
 	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
-	  (minus:V4DF (match_dup 1) (match_dup 2))
-	  (const_int 10)))]
+	  (plus:V4DF (match_dup 1) (match_dup 2))
+	  (const_int 5)))]
   "TARGET_AVX"
   "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseadd")
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
-(define_insn "*avx_addsubv4df3_1"
-  [(set (match_operand:V4DF 0 "register_operand" "=x")
-  	(vec_select:V4DF
-	  (vec_concat:V8DF
-	    (minus:V4DF
-	      (match_operand:V4DF 1 "register_operand" "x")
-	      (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
-	    (plus:V4DF (match_dup 1) (match_dup 2)))
-	  (parallel [(const_int 0) (const_int 5)
-		     (const_int 2) (const_int 7)])))]
-  "TARGET_AVX"
-  "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V4DF")])
-
-(define_insn "*avx_addsubv4df3_1s"
-  [(set (match_operand:V4DF 0 "register_operand" "=x")
-  	(vec_select:V4DF
-	  (vec_concat:V8DF
-	    (minus:V4DF
-	      (match_operand:V4DF 1 "register_operand" "x")
-	      (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
-	    (plus:V4DF (match_dup 2) (match_dup 1)))
-	  (parallel [(const_int 0) (const_int 5)
-		     (const_int 2) (const_int 7)])))]
-  "TARGET_AVX"
-  "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V4DF")])
-
 (define_insn "sse3_addsubv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x,x")
 	(vec_merge:V2DF
-	  (plus:V2DF
+	  (minus:V2DF
 	    (match_operand:V2DF 1 "register_operand" "0,x")
 	    (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
-	  (minus:V2DF (match_dup 1) (match_dup 2))
-	  (const_int 2)))]
+	  (plus:V2DF (match_dup 1) (match_dup 2))
+	  (const_int 1)))]
   "TARGET_SSE3"
   "@
    addsubpd\t{%2, %0|%0, %2}
@@ -2082,102 +2052,28 @@ 
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "V2DF")])
 
-(define_insn "*sse3_addsubv2df3_1"
-  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
-	(vec_select:V2DF
-	  (vec_concat:V4DF
-	    (minus:V2DF
-	      (match_operand:V2DF 1 "register_operand" "0,x")
-	      (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
-	    (plus:V2DF (match_dup 1) (match_dup 2)))
-	  (parallel [(const_int 0) (const_int 3)])))]
-  "TARGET_SSE3"
-  "@
-   addsubpd\t{%2, %0|%0, %2}
-   vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseadd")
-   (set_attr "atom_unit" "complex")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "V2DF")])
-
-(define_insn "*sse3_addsubv2df3_1s"
-  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
-	(vec_select:V2DF
-	  (vec_concat:V4DF
-	    (minus:V2DF
-	      (match_operand:V2DF 1 "register_operand" "0,x")
-	      (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
-	    (plus:V2DF (match_dup 2) (match_dup 1)))
-	  (parallel [(const_int 0) (const_int 3)])))]
-  "TARGET_SSE3"
-  "@
-   addsubpd\t{%2, %0|%0, %2}
-   vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseadd")
-   (set_attr "atom_unit" "complex")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "V2DF")])
-
 (define_insn "avx_addsubv8sf3"
   [(set (match_operand:V8SF 0 "register_operand" "=x")
 	(vec_merge:V8SF
-	  (plus:V8SF
+	  (minus:V8SF
 	    (match_operand:V8SF 1 "register_operand" "x")
 	    (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
-	  (minus:V8SF (match_dup 1) (match_dup 2))
-	  (const_int 170)))]
+	  (plus:V8SF (match_dup 1) (match_dup 2))
+	  (const_int 85)))]
   "TARGET_AVX"
   "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseadd")
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
 
-(define_insn "*avx_addsubv8sf3_1"
-  [(set (match_operand:V8SF 0 "register_operand" "=x")
-	(vec_select:V8SF
-	  (vec_concat:V16SF
-	    (minus:V8SF
-	      (match_operand:V8SF 1 "register_operand" "x")
-	      (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
-	    (plus:V8SF (match_dup 1) (match_dup 2)))
-	  (parallel [(const_int 0) (const_int 9)
-		     (const_int 2) (const_int 11)
-		     (const_int 4) (const_int 13)
-		     (const_int 6) (const_int 15)])))]
-  "TARGET_AVX"
-  "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
-
-(define_insn "*avx_addsubv8sf3_1s"
-  [(set (match_operand:V8SF 0 "register_operand" "=x")
-	(vec_select:V8SF
-	  (vec_concat:V16SF
-	    (minus:V8SF
-	      (match_operand:V8SF 1 "register_operand" "x")
-	      (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
-	    (plus:V8SF (match_dup 2) (match_dup 1)))
-	  (parallel [(const_int 0) (const_int 9)
-		     (const_int 2) (const_int 11)
-		     (const_int 4) (const_int 13)
-		     (const_int 6) (const_int 15)])))]
-  "TARGET_AVX"
-  "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
-
 (define_insn "sse3_addsubv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
 	(vec_merge:V4SF
-	  (plus:V4SF
+	  (minus:V4SF
 	    (match_operand:V4SF 1 "register_operand" "0,x")
 	    (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
-	  (minus:V4SF (match_dup 1) (match_dup 2))
-	  (const_int 10)))]
+	  (plus:V4SF (match_dup 1) (match_dup 2))
+	  (const_int 5)))]
   "TARGET_SSE3"
   "@
    addsubps\t{%2, %0|%0, %2}
@@ -2188,46 +2084,124 @@ 
    (set_attr "prefix_rep" "1,*")
    (set_attr "mode" "V4SF")])
 
-(define_insn "*sse3_addsubv4sf3_1"
-  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
-	(vec_select:V4SF
-	  (vec_concat:V8SF
-	    (minus:V4SF
-	      (match_operand:V4SF 1 "register_operand" "0,x")
-	      (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
-	    (plus:V4SF (match_dup 1) (match_dup 2)))
-	  (parallel [(const_int 0) (const_int 5)
-		     (const_int 2) (const_int 7)])))]
-  "TARGET_SSE3"
-  "@
-   addsubps\t{%2, %0|%0, %2}
-   vaddsubps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseadd")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "prefix_rep" "1,*")
-   (set_attr "mode" "V4SF")])
+(define_split
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(match_operator:VF_128_256 6 "addsub_vm_operator"
+	  [(minus:VF_128_256
+	     (match_operand:VF_128_256 1 "register_operand")
+	     (match_operand:VF_128_256 2 "nonimmediate_operand"))
+	   (plus:VF_128_256
+	     (match_operand:VF_128_256 3 "nonimmediate_operand")
+	     (match_operand:VF_128_256 4 "nonimmediate_operand"))
+	   (match_operand 5 "const_int_operand")]))]
+  "TARGET_SSE3
+   && can_create_pseudo_p ()
+   && ((rtx_equal_p (operands[1], operands[3])
+	&& rtx_equal_p (operands[2], operands[4]))
+       || (rtx_equal_p (operands[1], operands[4])
+	   && rtx_equal_p (operands[2], operands[3])))"
+  [(set (match_dup 0)
+	(vec_merge:VF_128_256
+	  (minus:VF_128_256 (match_dup 1) (match_dup 2))
+	  (plus:VF_128_256 (match_dup 1) (match_dup 2))
+	  (match_dup 5)))])
 
-(define_insn "*sse3_addsubv4sf3_1s"
-  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
-	(vec_select:V4SF
-	  (vec_concat:V8SF
-	    (minus:V4SF
-	      (match_operand:V4SF 1 "register_operand" "0,x")
-	      (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
-	    (plus:V4SF (match_dup 2) (match_dup 1)))
-	  (parallel [(const_int 0) (const_int 5)
-		     (const_int 2) (const_int 7)])))]
-  "TARGET_SSE3"
-  "@
-   addsubps\t{%2, %0|%0, %2}
-   vaddsubps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseadd")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "prefix_rep" "1,*")
-   (set_attr "mode" "V4SF")])
+(define_split
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(match_operator:VF_128_256 6 "addsub_vm_operator"
+	  [(plus:VF_128_256
+	     (match_operand:VF_128_256 1 "nonimmediate_operand")
+	     (match_operand:VF_128_256 2 "nonimmediate_operand"))
+	   (minus:VF_128_256
+	     (match_operand:VF_128_256 3 "register_operand")
+	     (match_operand:VF_128_256 4 "nonimmediate_operand"))
+	   (match_operand 5 "const_int_operand")]))]
+  "TARGET_SSE3
+   && can_create_pseudo_p ()
+   && ((rtx_equal_p (operands[1], operands[3])
+	&& rtx_equal_p (operands[2], operands[4]))
+       || (rtx_equal_p (operands[1], operands[4])
+	   && rtx_equal_p (operands[2], operands[3])))"
+  [(set (match_dup 0)
+	(vec_merge:VF_128_256
+	  (minus:VF_128_256 (match_dup 3) (match_dup 4))
+	  (plus:VF_128_256 (match_dup 3) (match_dup 4))
+	  (match_dup 5)))]
+{
+  /* Negate mask bits to compensate for swapped PLUS and MINUS RTXes.  */
+  operands[5]
+    = GEN_INT (~INTVAL (operands[5])
+	       & ((HOST_WIDE_INT_1U << GET_MODE_NUNITS (<MODE>mode)) - 1));
+})
 
+(define_split
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(match_operator:VF_128_256 7 "addsub_vs_operator"
+	  [(vec_concat:<ssedoublemode>
+	     (minus:VF_128_256
+	       (match_operand:VF_128_256 1 "register_operand")
+	       (match_operand:VF_128_256 2 "nonimmediate_operand"))
+	     (plus:VF_128_256
+	       (match_operand:VF_128_256 3 "nonimmediate_operand")
+	       (match_operand:VF_128_256 4 "nonimmediate_operand")))
+	   (match_parallel 5 "addsub_vs_parallel"
+	     [(match_operand 6 "const_int_operand")])]))]
+  "TARGET_SSE3
+   && can_create_pseudo_p ()
+   && ((rtx_equal_p (operands[1], operands[3])
+	&& rtx_equal_p (operands[2], operands[4]))
+       || (rtx_equal_p (operands[1], operands[4])
+	   && rtx_equal_p (operands[2], operands[3])))"
+  [(set (match_dup 0)
+	(vec_merge:VF_128_256
+	  (minus:VF_128_256 (match_dup 1) (match_dup 2))
+	  (plus:VF_128_256 (match_dup 1) (match_dup 2))
+	  (match_dup 5)))]
+{
+  int i, nelt = XVECLEN (operands[5], 0);
+  HOST_WIDE_INT ival = 0;
+
+  for (i = 0; i < nelt; i++)
+    if (INTVAL (XVECEXP (operands[5], 0, i)) < GET_MODE_NUNITS (<MODE>mode))
+      ival |= HOST_WIDE_INT_1 << i;
+
+  operands[5] = GEN_INT (ival);
+})
+
+(define_split
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(match_operator:VF_128_256 7 "addsub_vs_operator"
+	  [(vec_concat:<ssedoublemode>
+	     (plus:VF_128_256
+	       (match_operand:VF_128_256 1 "nonimmediate_operand")
+	       (match_operand:VF_128_256 2 "nonimmediate_operand"))
+	     (minus:VF_128_256
+	       (match_operand:VF_128_256 3 "register_operand")
+	       (match_operand:VF_128_256 4 "nonimmediate_operand")))
+	   (match_parallel 5 "addsub_vs_parallel"
+	     [(match_operand 6 "const_int_operand")])]))]
+  "TARGET_SSE3
+   && can_create_pseudo_p ()
+   && ((rtx_equal_p (operands[1], operands[3])
+	&& rtx_equal_p (operands[2], operands[4]))
+       || (rtx_equal_p (operands[1], operands[4])
+	   && rtx_equal_p (operands[2], operands[3])))"
+  [(set (match_dup 0)
+	(vec_merge:VF_128_256
+	  (minus:VF_128_256 (match_dup 3) (match_dup 4))
+	  (plus:VF_128_256 (match_dup 3) (match_dup 4))
+	  (match_dup 5)))]
+{
+  int i, nelt = XVECLEN (operands[5], 0);
+  HOST_WIDE_INT ival = 0;
+
+  for (i = 0; i < nelt; i++)
+    if (INTVAL (XVECEXP (operands[5], 0, i)) >= GET_MODE_NUNITS (<MODE>mode))
+      ival |= HOST_WIDE_INT_1 << i;
+
+  operands[5] = GEN_INT (ival);
+})
+
 (define_insn "avx_h<plusminus_insn>v4df3"
   [(set (match_operand:V4DF 0 "register_operand" "=x")
 	(vec_concat:V4DF