[i386]: Fix PR 56766, generate ADDSUB insns

Message ID CAFULd4Y7eCud1bzxa=76govMGACgKObBZ=9_JaB6GhNH71ikXw@mail.gmail.com

Commit Message

Uros Bizjak June 16, 2015, 4:34 p.m. UTC
Hello!

The attached patch implements the missing (vec_select (vec_concat ...))
forms of the ADDSUB insns.

The core of the problem is that we are able to expand DFmode vectors
with a shuffle pattern, which is not the case for SFmode vectors
(shuffles have certain limitations on x86). SFmode vectors are
expanded using a blend pattern instead, and the existing forms of the
ADDSUB patterns were written to combine plus and minus RTXes with the
blend pattern.
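
For reference, the DFmode shuffle from the tests below combines into a
(vec_select (vec_concat ...)) RTX of roughly this shape, which the new
patterns match (a sketch only; the pseudo register numbers are made up):

(set (reg:V4DF 93)
   (vec_select:V4DF
     (vec_concat:V8DF
       (minus:V4DF (reg:V4DF 91 [ x ]) (reg:V4DF 92 [ y ]))
       (plus:V4DF (reg:V4DF 91 [ x ]) (reg:V4DF 92 [ y ])))
     (parallel [(const_int 0) (const_int 5)
                (const_int 2) (const_int 7)])))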

The new patterns are able to emit addsub for both of the following tests:

typedef double v4df __attribute__((vector_size(32)));
typedef long long v4di __attribute__((vector_size(32)));
v4df foo_v4df (v4df x, v4df y)
{
  v4df tem0 = x - y;
  v4df tem1 = x + y;
  return __builtin_shuffle (tem0, tem1, (v4di) { 0, 5, 2, 7 });
}

v4df bar_v4df (v4df x, v4df y)
{
  v4df tem0 = x + y;
  v4df tem1 = x - y;
  return __builtin_shuffle (tem0, tem1, (v4di) { 4, 1, 6, 3 });
}

However, the existing patterns still fail for the second of the
following otherwise functionally identical SFmode testcases:

typedef float v4sf __attribute__((vector_size(16)));
typedef int v4si __attribute__((vector_size(16)));
v4sf foo_v4sf (v4sf x, v4sf y)
{
  v4sf tem0 = x - y;
  v4sf tem1 = x + y;
  return __builtin_shuffle (tem0, tem1, (v4si) { 0, 5, 2, 7 });
}

v4sf bar_v4sf (v4sf x, v4sf y)
{
  v4sf tem0 = x + y;
  v4sf tem1 = x - y;
  return __builtin_shuffle (tem0, tem1, (v4si) { 4, 1, 6, 3 });
}

due to the missing pattern:

(set (reg:V4SF 93 [ D.2159 ])
   (vec_merge:V4SF (minus:V4SF (reg/v:V4SF 91 [ x ])
           (reg/v:V4SF 92 [ y ]))
       (plus:V4SF (reg/v:V4SF 91 [ x ])
           (reg/v:V4SF 92 [ y ]))
       (const_int 5 [0x5])))

(I'll open a PR for this).

The new patterns also implement the "partially swapped" case, where
the operands of the commutative plus RTX get swapped. As discussed in
the PR, there is currently no way to handle this other than
duplicating the patterns; see the illustrative example below.
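
For illustration, a source variant along these lines can present the
plus with its operands in the opposite order from the minus (a
hypothetical testcase, not part of the patch; whether the swap
survives to combine depends on operand canonicalization):

typedef double v4df __attribute__((vector_size(32)));
typedef long long v4di __attribute__((vector_size(32)));
v4df baz_v4df (v4df x, v4df y)
{
  v4df tem0 = x - y;
  v4df tem1 = y + x;  /* plus operands swapped relative to the minus */
  return __builtin_shuffle (tem0, tem1, (v4di) { 0, 5, 2, 7 });
}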

2015-06-16  Uros Bizjak  <ubizjak@gmail.com>

    PR target/56766
    * config/i386/sse.md (*avx_addsubv4df3_1): New insn pattern.
    (*avx_addsubv4df3_1s): Ditto.
    (*sse3_addsubv2df3_1): Ditto.
    (*sse3_addsubv2df3_1s): Ditto.
    (*avx_addsubv8sf3_1): Ditto.
    (*avx_addsubv8sf3_1s): Ditto.
    (*sse3_addsubv4sf3_1): Ditto.
    (*sse3_addsubv4sf3_1s): Ditto.

testsuite/ChangeLog:

2015-06-16  Uros Bizjak  <ubizjak@gmail.com>

    PR target/56766
    * gcc.target/i386/pr56766-1.c: New test.
    * gcc.target/i386/pr56766-2.c: Ditto.

The patch was bootstrapped and regression tested on x86_64-linux-gnu
{,-m32} and will be committed to mainline SVN.

Uros.

Patch

Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 224523)
+++ config/i386/sse.md	(working copy)
@@ -2032,6 +2032,38 @@ 
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
+(define_insn "*avx_addsubv4df3_1"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+  	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (minus:V4DF
+	      (match_operand:V4DF 1 "register_operand" "x")
+	      (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	    (plus:V4DF (match_dup 1) (match_dup 2)))
+	  (parallel [(const_int 0) (const_int 5)
+		     (const_int 2) (const_int 7)])))]
+  "TARGET_AVX"
+  "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+(define_insn "*avx_addsubv4df3_1s"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+  	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (minus:V4DF
+	      (match_operand:V4DF 1 "register_operand" "x")
+	      (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	    (plus:V4DF (match_dup 2) (match_dup 1)))
+	  (parallel [(const_int 0) (const_int 5)
+		     (const_int 2) (const_int 7)])))]
+  "TARGET_AVX"
+  "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
 (define_insn "sse3_addsubv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x,x")
 	(vec_merge:V2DF
@@ -2050,6 +2082,44 @@ 
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "V2DF")])
 
+(define_insn "*sse3_addsubv2df3_1"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (minus:V2DF
+	      (match_operand:V2DF 1 "register_operand" "0,x")
+	      (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
+	    (plus:V2DF (match_dup 1) (match_dup 2)))
+	  (parallel [(const_int 0) (const_int 3)])))]
+  "TARGET_SSE3"
+  "@
+   addsubpd\t{%2, %0|%0, %2}
+   vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "atom_unit" "complex")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "*sse3_addsubv2df3_1s"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (minus:V2DF
+	      (match_operand:V2DF 1 "register_operand" "0,x")
+	      (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
+	    (plus:V2DF (match_dup 2) (match_dup 1)))
+	  (parallel [(const_int 0) (const_int 3)])))]
+  "TARGET_SSE3"
+  "@
+   addsubpd\t{%2, %0|%0, %2}
+   vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "atom_unit" "complex")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V2DF")])
+
 (define_insn "avx_addsubv8sf3"
   [(set (match_operand:V8SF 0 "register_operand" "=x")
 	(vec_merge:V8SF
@@ -2064,6 +2134,42 @@ 
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
 
+(define_insn "*avx_addsubv8sf3_1"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (minus:V8SF
+	      (match_operand:V8SF 1 "register_operand" "x")
+	      (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+	    (plus:V8SF (match_dup 1) (match_dup 2)))
+	  (parallel [(const_int 0) (const_int 9)
+		     (const_int 2) (const_int 11)
+		     (const_int 4) (const_int 13)
+		     (const_int 6) (const_int 15)])))]
+  "TARGET_AVX"
+  "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "*avx_addsubv8sf3_1s"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (minus:V8SF
+	      (match_operand:V8SF 1 "register_operand" "x")
+	      (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+	    (plus:V8SF (match_dup 2) (match_dup 1)))
+	  (parallel [(const_int 0) (const_int 9)
+		     (const_int 2) (const_int 11)
+		     (const_int 4) (const_int 13)
+		     (const_int 6) (const_int 15)])))]
+  "TARGET_AVX"
+  "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
 (define_insn "sse3_addsubv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
 	(vec_merge:V4SF
@@ -2082,6 +2188,46 @@ 
    (set_attr "prefix_rep" "1,*")
    (set_attr "mode" "V4SF")])
 
+(define_insn "*sse3_addsubv4sf3_1"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (minus:V4SF
+	      (match_operand:V4SF 1 "register_operand" "0,x")
+	      (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
+	    (plus:V4SF (match_dup 1) (match_dup 2)))
+	  (parallel [(const_int 0) (const_int 5)
+		     (const_int 2) (const_int 7)])))]
+  "TARGET_SSE3"
+  "@
+   addsubps\t{%2, %0|%0, %2}
+   vaddsubps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "prefix_rep" "1,*")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "*sse3_addsubv4sf3_1s"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (minus:V4SF
+	      (match_operand:V4SF 1 "register_operand" "0,x")
+	      (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
+	    (plus:V4SF (match_dup 2) (match_dup 1)))
+	  (parallel [(const_int 0) (const_int 5)
+		     (const_int 2) (const_int 7)])))]
+  "TARGET_SSE3"
+  "@
+   addsubps\t{%2, %0|%0, %2}
+   vaddsubps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "prefix_rep" "1,*")
+   (set_attr "mode" "V4SF")])
+
 (define_insn "avx_h<plusminus_insn>v4df3"
   [(set (match_operand:V4DF 0 "register_operand" "=x")
 	(vec_concat:V4DF
Index: testsuite/gcc.target/i386/pr56766-1.c
===================================================================
--- testsuite/gcc.target/i386/pr56766-1.c	(revision 0)
+++ testsuite/gcc.target/i386/pr56766-1.c	(working copy)
@@ -0,0 +1,42 @@ 
+/* PR target/56766 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+v4sf foo_v4sf (v4sf x, v4sf y)
+{
+  v4sf tem0 = x - y;
+  v4sf tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 0, 5, 2, 7 });
+}
+
+typedef float v8sf __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+v8sf foo_v8sf (v8sf x, v8sf y)
+{
+  v8sf tem0 = x - y;
+  v8sf tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 0, 9, 2, 11, 4, 13, 6, 15 });
+}
+
+typedef double v2df __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+v2df foo_v2df (v2df x, v2df y)
+{
+  v2df tem0 = x - y;
+  v2df tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 0, 3 });
+}
+
+typedef double v4df __attribute__((vector_size(32)));
+typedef long long v4di __attribute__((vector_size(32)));
+v4df foo_v4df (v4df x, v4df y)
+{
+  v4df tem0 = x - y;
+  v4df tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 0, 5, 2, 7 });
+}
+
+/* { dg-final { scan-assembler-times "vaddsubps" 2 } } */
+/* { dg-final { scan-assembler-times "vaddsubpd" 2 } } */
Index: testsuite/gcc.target/i386/pr56766-2.c
===================================================================
--- testsuite/gcc.target/i386/pr56766-2.c	(revision 0)
+++ testsuite/gcc.target/i386/pr56766-2.c	(working copy)
@@ -0,0 +1,40 @@ 
+/* PR target/56766 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model -mavx" } */
+
+void test_v4sf (float * __restrict__ p, float * __restrict q)
+{
+  p[0] = p[0] - q[0];
+  p[1] = p[1] + q[1];
+  p[2] = p[2] - q[2];
+  p[3] = p[3] + q[3];
+}
+
+void test_v8sf (float * __restrict__ p, float * __restrict q)
+{
+  p[0] = p[0] - q[0];
+  p[1] = p[1] + q[1];
+  p[2] = p[2] - q[2];
+  p[3] = p[3] + q[3];
+  p[4] = p[4] - q[4];
+  p[5] = p[5] + q[5];
+  p[6] = p[6] - q[6];
+  p[7] = p[7] + q[7];
+}
+
+void test_v2df (double * __restrict__ p, double * __restrict q)
+{
+  p[0] = p[0] - q[0];
+  p[1] = p[1] + q[1];
+}
+
+void test_v4df (double * __restrict__ p, double * __restrict q)
+{
+  p[0] = p[0] - q[0];
+  p[1] = p[1] + q[1];
+  p[2] = p[2] - q[2];
+  p[3] = p[3] + q[3];
+}
+
+/* { dg-final { scan-assembler-times "vaddsubps" 2 } } */
+/* { dg-final { scan-assembler-times "vaddsubpd" 2 } } */