@@ -1,3 +1,19 @@
+2013-11-20 Cong Hou <congh@google.com>
+
+ PR tree-optimization/56902
+ * config/i386/i386-protos.h (ix86_generate_mask_from_vec_merge):
+ New function.
+ * config/i386/i386.c (ix86_generate_mask_from_vec_merge): Likewise.
+ (expand_vec_perm_blend): Add a new flag to allow emulating the blend
+ instruction on SSE/SSE2.
+ (ix86_expand_vec_perm_const_1): Try emulating the blend instruction.
+ * config/i386/sse.md (avx_addsubv4df3, sse3_addsubv2df3): Express
+ addsub as a vec_select of a vec_concat instead of a vec_merge.
+ (*vec_merge<mode>): New define_insn_and_split for splitting vec_merge.
+ * tree-vect-slp.c (vect_build_slp_tree_1): Add the support of detecting
+ addsub pattern for SLP (from Richard Biener).
+ (vect_schedule_slp_instance): Likewise.
+
2013-11-12 Jeff Law <law@redhat.com>
* tree-ssa-threadedge.c (thread_around_empty_blocks): New
@@ -106,6 +106,7 @@ extern void ix86_expand_unary_operator (enum
rtx_code, enum machine_mode,
rtx[]);
extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
extern rtx ix86_build_signbit_mask (enum machine_mode, bool, bool);
+extern rtx ix86_generate_mask_from_vec_merge (enum machine_mode, rtx);
extern void ix86_split_convert_uns_si_sse (rtx[]);
extern void ix86_expand_convert_uns_didf_sse (rtx, rtx);
extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx);
@@ -39633,11 +39633,61 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
return ok;
}
+/* Generate a vector mask from a scalar mask, which is used in vec_merge
+   instruction.  MASK is a CONST_INT whose bit I selects element I; the
+   result is a new pseudo of mode VMODE whose selected elements have all
+   bits set and whose other elements are zero.  VMODE must be V2DFmode,
+   V2DImode, V4SFmode, V4SImode or V8HImode; V16QImode is not handled.  */
+
+rtx
+ix86_generate_mask_from_vec_merge (enum machine_mode vmode, rtx mask)
+{
+  int int_mask = INTVAL (mask);
+  enum machine_mode imode;
+  int i, nelt;
+  rtvec v;
+
+  /* Build the constant in the integer vector mode with the same number
+     of elements as VMODE.  */
+  switch (vmode)
+    {
+    case V2DFmode:
+    case V2DImode:
+      imode = V2DImode;
+      break;
+
+    case V4SFmode:
+    case V4SImode:
+      imode = V4SImode;
+      break;
+
+    case V8HImode:
+      imode = V8HImode;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* A CONST_INT must be sign-extended from its mode, so an all-ones
+     element is constm1_rtx, not GEN_INT (0xFFFFFFFF) or GEN_INT (0xFFFF).  */
+  nelt = GET_MODE_NUNITS (imode);
+  v = rtvec_alloc (nelt);
+  for (i = 0; i < nelt; ++i)
+    RTVEC_ELT (v, i) = (int_mask & (1 << i)) ? constm1_rtx : const0_rtx;
+
+  rtx ret = gen_reg_rtx (vmode);
+  convert_move (ret, gen_rtx_CONST_VECTOR (imode, v), false);
+  return ret;
+}
+
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
- in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
+ in terms of blendp[sd] / pblendw / pblendvb / vpblendd. If EMULATE
+ is set, then we generate vec_merge instruction for SSE/SSE2, which
+ can be emulated by and/andn/or approach. */
static bool
-expand_vec_perm_blend (struct expand_vec_perm_d *d)
+expand_vec_perm_blend (struct expand_vec_perm_d *d, bool emulate = false)
{
enum machine_mode vmode = d->vmode;
unsigned i, mask, nelt = d->nelt;
@@ -39646,7 +39696,14 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+
+ /* If the flag EMULATE is set, then we will use and/andn/or to emulate a
+ blend instruction. Note that we don't emulate blend instruction for
+ V16QImode, in which case vec_merge instruction won't be generated, and
+ this is also not useful in practice. */
+ if (emulate && !TARGET_SSE4_1 && vmode != V16QImode)
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
@@ -39667,9 +39724,6 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->testing_p)
return true;
- /* ??? Without SSE4.1, we could implement this with and/andn/or. This
- decision should be extracted elsewhere, so that we only try that
- sequence once all budget==3 options have been tried. */
target = d->target;
op0 = d->op0;
op1 = d->op1;
@@ -41629,6 +41683,10 @@ ix86_expand_vec_perm_const_1 (struct
expand_vec_perm_d *d)
if (expand_vec_perm_1 (d))
return true;
+ /* Try emulating the blend instruction using other instructions. */
+ if (expand_vec_perm_blend (d, true))
+ return true;
+
/* Try sequences of two instructions. */
if (expand_vec_perm_pshuflw_pshufhw (d))
@@ -737,6 +737,11 @@
(and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 2, 3)")))
+;; Match a const_int whose value is in the range 2 to 255, inclusive.
+(define_predicate "const_2_to_255_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), 2, 255)")))
+
;; Match 4 to 5.
(define_predicate "const_4_to_5_operand"
(and (match_code "const_int")
@@ -1509,13 +1509,15 @@
(set_attr "mode" "<MODE>")])
(define_insn "avx_addsubv4df3"
- [(set (match_operand:V4DF 0 "register_operand" "=x")
- (vec_merge:V4DF
- (plus:V4DF
- (match_operand:V4DF 1 "register_operand" "x")
- (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
- (minus:V4DF (match_dup 1) (match_dup 2))
- (const_int 10)))]
+[(set (match_operand:V4DF 0 "register_operand" "=x,x")
+ (vec_select:V4DF
+ (vec_concat:V8DF
+ (minus:V4DF
+ (match_operand:V4DF 1 "register_operand" "0,x")
+ (match_operand:V4DF 2 "nonimmediate_operand" "xm,xm"))
+ (plus:V4DF (match_dup 1) (match_dup 2)))
+ (parallel [(const_int 0) (const_int 5)
+ (const_int 2) (const_int 7)])))]
"TARGET_AVX"
"vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sseadd")
@@ -1524,12 +1526,13 @@
(define_insn "sse3_addsubv2df3"
[(set (match_operand:V2DF 0 "register_operand" "=x,x")
- (vec_merge:V2DF
- (plus:V2DF
- (match_operand:V2DF 1 "register_operand" "0,x")
- (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
- (minus:V2DF (match_dup 1) (match_dup 2))
- (const_int 2)))]
+ (vec_select:V2DF
+ (vec_concat:V4DF
+ (minus:V2DF
+ (match_operand:V2DF 1 "register_operand" "0,x")
+ (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
+ (plus:V2DF (match_dup 1) (match_dup 2)))
+ (parallel [(const_int 0) (const_int 3)])))]
"TARGET_SSE3"
"@
addsubpd\t{%2, %0|%0, %2}
@@ -9716,6 +9719,24 @@
[(set (attr "length")
(symbol_ref ("(Pmode != word_mode) + 3")))])
+;; Emulate a constant-mask blend on 128-bit vectors without SSE4.1:
+;; (mask & op2) | (~mask & op1), where mask is the all-ones/all-zeros
+;; vector built from the immediate in operand 3.  V16QImode is excluded
+;; because ix86_generate_mask_from_vec_merge does not handle it.  The
+;; first AND goes to a fresh pseudo so that operand 1 is still intact
+;; for the ANDN even when it is the same register as operand 0.
+(define_insn_and_split "*vec_merge<mode>"
+  [(set (match_operand:V_128 0 "register_operand")
+        (vec_merge:V_128
+          (match_operand:V_128 2 "nonimmediate_operand")
+          (match_operand:V_128 1 "register_operand")
+          (match_operand:SI 3 "const_2_to_255_operand")))]
+  "TARGET_SSE && !TARGET_SSE4_1
+   && <MODE>mode != V16QImode
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+        (and:V_128 (match_dup 3) (match_dup 2)))
+   (set (match_dup 3)
+        (and:V_128
+          (not:V_128 (match_dup 3)) (match_dup 1)))
+   (set (match_dup 0)
+        (ior:V_128 (match_dup 4) (match_dup 3)))]
+{
+  operands[3] = ix86_generate_mask_from_vec_merge (<MODE>mode, operands[3]);
+  operands[4] = gen_reg_rtx (<MODE>mode);
+})
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; SSSE3 instructions
@@ -1,3 +1,10 @@
+2013-11-20 Cong Hou <congh@google.com>
+
+ * gcc.target/i386/sse-vect-addsubps.c: New test.
+ * gcc.target/i386/sse2-vect-addsubpd.c: New test.
+ * gcc.target/i386/sse3-vect-addsubps.c: New test.
+ * gcc.target/i386/sse3-vect-addsubpd.c: New test.
+
2013-11-12 Balaji V. Iyer <balaji.v.iyer@intel.com>
* gcc.dg/cilk-plus/cilk-plus.exp: Added a check for LTO before running
b/gcc/testsuite/gcc.target/i386/sse-vect-addsubps.c
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse_runtime } */
+/* { dg-additional-options "-O2 -msse -ftree-vectorize -fdump-tree-slp-details" } */
+/* Check that an alternating subtract/add over four floats is SLP
+   vectorized with SSE enabled and still computes the expected values.  */
+
+float a[4], b[4], c[4];
+
+/* Kept out of line so that main calls the compiled body of subadd
+   rather than an inlined copy.  */
+__attribute__ ((noinline)) void
+subadd (void)
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+
+  /* Return explicitly: falling off the end of main leaves the exit
+     status undefined in C90, and this is a run test.  */
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
b/gcc/testsuite/gcc.target/i386/sse2-vect-addsubpd.c
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse2_runtime } */
+/* { dg-additional-options "-O2 -msse2 -ftree-vectorize -fdump-tree-slp-details" } */
+/* Check that an alternating subtract/add over four doubles is SLP
+   vectorized with SSE2 enabled and still computes the expected values.  */
+
+double a[4], b[4], c[4];
+
+/* Kept out of line so that main calls the compiled body of subadd
+   rather than an inlined copy.  */
+__attribute__ ((noinline)) void
+subadd (void)
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+
+  /* Return explicitly: falling off the end of main leaves the exit
+     status undefined in C90, and this is a run test.  */
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubpd.c
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3_runtime } */
+/* { dg-additional-options "-O2 -msse3 -ftree-vectorize -fdump-tree-slp-details" } */
+/* Check that an alternating subtract/add over four doubles is SLP
+   vectorized with SSE3 enabled and still computes the expected values.  */
+
+double a[4], b[4], c[4];
+
+/* Kept out of line so that main calls the compiled body of subadd
+   rather than an inlined copy.  */
+__attribute__ ((noinline)) void
+subadd (void)
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+
+  /* Return explicitly: falling off the end of main leaves the exit
+     status undefined in C90, and this is a run test.  */
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubps.c
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3_runtime } */
+/* { dg-additional-options "-O2 -msse3 -ftree-vectorize -fdump-tree-slp-details" } */
+/* Check that an alternating subtract/add over four floats is SLP
+   vectorized with SSE3 enabled and still computes the expected values.  */
+
+float a[4], b[4], c[4];
+
+/* Kept out of line so that main calls the compiled body of subadd
+   rather than an inlined copy.  */
+__attribute__ ((noinline)) void
+subadd (void)
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+
+  /* Return explicitly: falling off the end of main leaves the exit
+     status undefined in C90, and this is a run test.  */
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
@@ -398,7 +398,7 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo,
bb_vec_info bb_vinfo,
unsigned int vectorization_factor, bool *matches)
{
unsigned int i;
- gimple stmt = stmts[0];
+ gimple first_stmt = stmts[0], stmt = stmts[0];
enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
enum tree_code first_cond_code = ERROR_MARK;