diff mbox

Support addsub/subadd as non-isomorphic operations for SLP vectorizer.

Message ID CAK=A3=1Puwf0tkqMuuvRXxHk66a9+kkfL9x+-aqPvP=PdQOicw@mail.gmail.com
State New
Headers show

Commit Message

Cong Hou Nov. 21, 2013, 10:22 p.m. UTC
I have made a patch with new define_insn_and_split for vec_merge. The
addsub pattern can now be detected for float/double on all modes
(float from SSE and double from SSE2). From SSE3 we are using
ADDSUBPS/D, and for other modes we perform one minus, one add, and
then use and/andn/or method to blend those two results together (we
can further optimize it by selectively negating even elements and then
adding two operands - only two instructions required).

I applied your patch
(http://gcc.gnu.org/bugzilla/attachment.cgi?id=31209) in
tree-vect-slp.c with small modifications.

Currently the rtl patterns for addsub on float and double are
different: for float we are using vec_merge and for double we are
using vec_select. This is because for double vec_select can be
translated by one instruction and is chosen before vec_merge in
ix86_expand_vec_perm_const_1 (we can change this behavior but I am not
sure if it is necessary). This may look strange.

Bootstrapped and tested on an x86-64 machine.

The patch is pasted below. Please give me your comments.

Thank you!


Cong




   tree lhs;
@@ -588,6 +588,8 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo,
bb_vec_info bb_vinfo,
   || rhs_code != REALPART_EXPR)
       && (first_stmt_code != REALPART_EXPR
   || rhs_code != IMAGPART_EXPR)
+      && (first_stmt_code != MINUS_EXPR
+  || rhs_code != ((i & 1) ? PLUS_EXPR : MINUS_EXPR))
               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
@@ -601,6 +603,10 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo,
bb_vec_info bb_vinfo,
    "Build SLP failed: different operation "
    "in stmt ");
   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+   "original stmt ");
+  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+    first_stmt, 0);
                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  }
       /* Mismatch.  */
@@ -3062,6 +3068,63 @@ vect_schedule_slp_instance (slp_tree node,
slp_instance instance,
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
     }

+  if (is_gimple_assign (stmt)
+      && (gimple_assign_rhs_code (stmt) == PLUS_EXPR
+  || gimple_assign_rhs_code (stmt) == MINUS_EXPR))
+    {
+      enum tree_code code0 = gimple_assign_rhs_code (stmt);
+      enum tree_code ocode;
+      gimple ostmt;
+      unsigned char *mask = XCNEWVEC (unsigned char, nunits);
+      bool allsame = true;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
+ if (gimple_assign_rhs_code (ostmt) != code0)
+  {
+    mask[i] = i + nunits;
+    allsame = false;
+    ocode = gimple_assign_rhs_code (ostmt);
+  }
+ else
+  mask[i] = i;
+      if (!allsame)
+ {
+  vec<gimple> v0;
+  vec<gimple> v1;
+  unsigned j;
+  tree tmask = NULL_TREE;
+  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+  v0 = SLP_TREE_VEC_STMTS (node).copy ();
+  SLP_TREE_VEC_STMTS (node).truncate (0);
+  gimple_assign_set_rhs_code (stmt, ocode);
+  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+  gimple_assign_set_rhs_code (stmt, code0);
+  v1 = SLP_TREE_VEC_STMTS (node).copy ();
+  SLP_TREE_VEC_STMTS (node).truncate (0);
+  tmask = make_vector (nunits);
+  TREE_TYPE (tmask) = build_nonstandard_integer_type
+                                (GET_MODE_BITSIZE
+                                  (TYPE_MODE (TREE_TYPE (vectype))), 1);
+  TREE_TYPE (tmask) = get_same_sized_vectype (TREE_TYPE (tmask),
+                                                      vectype);
+  for (j = 0; j < nunits; ++j)
+    VECTOR_CST_ELT (tmask, j) = build_int_cst (TREE_TYPE
+                                          (TREE_TYPE (tmask)), mask[j]);
+  for (j = 0; j < v0.length (); ++j)
+    {
+      gimple vstmt;
+      tree tem = make_ssa_name (vectype, NULL);
+      vstmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, tem,
+    gimple_assign_lhs (v0[j]),
+    gimple_assign_lhs (v1[j]),
+                                                    tmask);
+      vect_finish_stmt_generation (stmt, vstmt, &si);
+      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
+    }
+  free (mask);
+  return false;
+ }
+      free (mask);
+    }
   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
   return is_store;
 }













On Wed, Nov 20, 2013 at 1:07 AM, Richard Biener <rguenther@suse.de> wrote:
> On Tue, 19 Nov 2013, Cong Hou wrote:
>
>> On Tue, Nov 19, 2013 at 1:45 AM, Richard Biener <rguenther@suse.de> wrote:
>> >
>> > On Mon, 18 Nov 2013, Cong Hou wrote:
>> >
>> > > I tried your method and it works well for doubles. But for float,
>> > > there is an issue. For the following gimple code:
>> > >
>> > >    c1 = a - b;
>> > >    c2 = a + b;
>> > >    c = VEC_PERM <c1, c2, [0,5,2,7]>
>> > >
>> > > It needs two instructions to implement the VEC_PERM operation in
>> > > SSE2-4, one of which should be using shufps which is represented by
>> > > the following pattern in rtl:
>> > >
>> > >
>> > > (define_insn "sse_shufps_<mode>"
>> > >   [(set (match_operand:VI4F_128 0 "register_operand" "=x,x")
>> > > (vec_select:VI4F_128
>> > >  (vec_concat:<ssedoublevecmode>
>> > >    (match_operand:VI4F_128 1 "register_operand" "0,x")
>> > >    (match_operand:VI4F_128 2 "nonimmediate_operand" "xm,xm"))
>> > >  (parallel [(match_operand 3 "const_0_to_3_operand")
>> > >     (match_operand 4 "const_0_to_3_operand")
>> > >     (match_operand 5 "const_4_to_7_operand")
>> > >     (match_operand 6 "const_4_to_7_operand")])))]
>> > > ...)
>> > >
>> > > Note that it contains two rtl instructions.
>> >
>> > It's a single instruction as far as combine is concerned (RTL
>> > instructions have arbitrary complexity).
>>
>>
>> Even it is one instruction, we will end up with four rtl statements,
>> which still cannot be combined as there are restrictions on combining
>> four instructions (loads of constants or binary operations involving a
>> constant). Note that vec_select instead of vec_merge is used here
>> because currently vec_merge is emitted only if SSE4 is enabled (thus
>> blend instructions can be used. If you look at
>> ix86_expand_vec_perm_const_1() in i386.c, you can find that vec_merge
>> is generated in expand_vec_perm_1() with SSE4.). Without SSE4 support,
>> in most cases a vec_merge statement could not be translated by one SSE
>> instruction.
>>
>> >
>> >
>> > > Together with minus, plus,
>> > > and one more shuffling instruction, we have at least five instructions
>> > > for addsub pattern. I think during the combine pass, only four
>> > > instructions are considered to be combined, right? So unless we
>> > > compress those five instructions into four or less, we could not use
>> > > this method for float values.
>> >
>> > At the moment addsubv4sf looks like
>> >
>> > (define_insn "sse3_addsubv4sf3"
>> >   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
>> >         (vec_merge:V4SF
>> >           (plus:V4SF
>> >             (match_operand:V4SF 1 "register_operand" "0,x")
>> >             (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
>> >           (minus:V4SF (match_dup 1) (match_dup 2))
>> >           (const_int 10)))]
>> >
>> > to match this it's best to have the VEC_SHUFFLE retained as
>> > vec_merge and thus support arbitrary(?) vec_merge for the aid
>> > of combining until reload(?) after which we can split it.
>> >
>>
>>
>> You mean VEC_PERM (this is generated in gimple from your patch)? Note
>> as I mentioned above, without SSE4, it is difficult to translate
>> VEC_PERM into vec_merge. Even if we can do it, we still need do define
>> split to convert one vec_merge into two or more other statements
>> later. ADDSUB instructions are provided by SSE3 and I think we should
>> not rely on SSE4 to perform this transformation, right?
>
> Sure not, we just keep vec_merge in the early RTL IL to give
> combine an easier job.  We split it later to expose the target details
> (see below).
>
>> To sum up, if we use vec_select instead of vec_merge, we may have four
>> rtl statements for float types, in which case they cannot be combined.
>> If we use vec_merge, we need to define the split for it without SSE4
>> support, and we need also to change the behavior of
>> ix86_expand_vec_perm_const_1().
>
> Yes.  But that would be best IMHO.  Otherwise we're not going to
> have PR56766 fixed for the plain C generic vector testcase
>
> typedef double v2df __attribute__((vector_size(16)));
> typedef long long v2di __attribute__((vector_size(16)));
> v2df foo (v2df x, v2df y)
> {
>   v2df tem1 = x - y;
>   v2df tem2 = x + y;
>   return __builtin_shuffle (tem1, tem2, (v2di) { 0, 3 });
> }
>
>> > > What do you think?
>> >
>> > Besides of addsub are there other instructions that can be expressed
>> > similarly?  Thus, how far should the combiner pattern go?
>> >
>>
>>
>> I think your method is quite flexible. Beside blending add/sub, we
>> could blend other combinations of two operations, and even one
>> operation and a no-op. For example, consider vectorizing complex
>> conjugate operation:
>>
>> for (int i = 0; i < N; i+=2) {
>>   a[i] = b[i];
>>   a[i+1] = -b[i+1];
>> }
>>
>> This loop is better to be vectorized by hybrid SLP. The second
>> statement has a unary minus operation but there is no operation in the
>> first one. We can improve our SLP grouping algorithm to let GCC SLP
>> vectorize it.
>
> Yes indeed - I also liked the flexibility in the end.  If cost
> considerations allow it we can handle mismatched ops
> quite well (albeit with a cost overhead).  Note that the
> VEC_PERM permutations generated are always of the form that allow
> implementing them with
>
>   res = VEC_PERM <op1, op2, selector>
>
> =>
>
>   op1 = op1 & selector-mask1;
>   op2 = op2 & selector-mask2;
>   res = op1 | op2;
>
> which may be often preferable to using more than one shuffle
> insn building block.  I'm not sure if the current expander
> considers this option for selectors that qualify for it,
> but basically the above is always possible for vec_merge
> (so having a generic vec_merge pattern, later split, wouldn't
> depend on anything more than SSE2).
>
> That is, the vec_perm_const expander could expand to vec_merge
> (if the permutation is of suitable form) and only later split
> to ISA dependent code if the insn didn't get combined.
>
> Richard.
>
>>
>> thanks,
>> Cong
>>
>>
>>
>> > Richard.
>> >
>> > >
>> > >
>> > >
>> > > thanks,
>> > > Cong
>> > >
>> > >
>> > > On Fri, Nov 15, 2013 at 12:53 AM, Richard Biener <rguenther@suse.de> wrote:
>> > > > On Thu, 14 Nov 2013, Cong Hou wrote:
>> > > >
>> > > >> Hi
>> > > >>
>> > > >> This patch adds the support to two non-isomorphic operations addsub
>> > > >> and subadd for SLP vectorizer. More non-isomorphic operations can be
>> > > >> added later, but the limitation is that operations on even/odd
>> > > >> elements should still be isomorphic. Once such an operation is
>> > > >> detected, the code of the operation used in vectorized code is stored
>> > > >> and later will be used during statement transformation. Two new GIMPLE
>> > > >> operations VEC_ADDSUB_EXPR and VEC_SUBADD_EXPR are defined. And also
>> > > >> new optabs for them. They are also documented.
>> > > >>
>> > > >> The target supports for SSE/SSE2/SSE3/AVX are added for those two new
>> > > >> operations on floating points. SSE3/AVX provides ADDSUBPD and ADDSUBPS
>> > > >> instructions. For SSE/SSE2, those two operations are emulated using
>> > > >> two instructions (selectively negate then add).
>> > > >>
>> > > >> With this patch the following function will be SLP vectorized:
>> > > >>
>> > > >>
>> > > >> float a[4], b[4], c[4];  // double also OK.
>> > > >>
>> > > >> void subadd ()
>> > > >> {
>> > > >>   c[0] = a[0] - b[0];
>> > > >>   c[1] = a[1] + b[1];
>> > > >>   c[2] = a[2] - b[2];
>> > > >>   c[3] = a[3] + b[3];
>> > > >> }
>> > > >>
>> > > >> void addsub ()
>> > > >> {
>> > > >>   c[0] = a[0] + b[0];
>> > > >>   c[1] = a[1] - b[1];
>> > > >>   c[2] = a[2] + b[2];
>> > > >>   c[3] = a[3] - b[3];
>> > > >> }
>> > > >>
>> > > >>
>> > > >> Bootstrapped and tested on an x86-64 machine.
>> > > >
>> > > > I managed to do this without adding new tree codes or optabs by
>> > > > vectorizing the above as
>> > > >
>> > > >    c1 = a + b;
>> > > >    c2 = a - b;
>> > > >    c = VEC_PERM <c1, c2, the-proper-mask>
>> > > >
>> > > > which then matches sse3_addsubv4sf3 if you fix that pattern to
>> > > > not use vec_merge (or fix PR56766).  Doing it this way also
>> > > > means that the code is vectorizable if you don't have a HW
>> > > > instruction for that but can do the VEC_PERM efficiently.
>> > > >
>> > > > So, I'd like to avoid new tree codes and optabs whenever possible
>> > > > and here I've already proved (with a patch) that it is possible.
>> > > > Didn't have time to clean it up, and it likely doesn't apply anymore
>> > > > (and PR56766 blocks it but it even has a patch).
>> > > >
>> > > > Btw, this was PR56902 where I attached my patch.
>> > > >
>> > > > Richard.
>> > > >
>> > > >>
>> > > >> thanks,
>> > > >> Cong
>> > > >>
>> > > >>
>> > > >>
>> > > >>
>> > > >>
>> > > >> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> > > >> index 2c0554b..656d5fb 100644
>> > > >> --- a/gcc/ChangeLog
>> > > >> +++ b/gcc/ChangeLog
>> > > >> @@ -1,3 +1,31 @@
>> > > >> +2013-11-14  Cong Hou  <congh@google.com>
>> > > >> +
>> > > >> + * tree-vect-slp.c (vect_create_new_slp_node): Initialize
>> > > >> + SLP_TREE_OP_CODE.
>> > > >> + (slp_supported_non_isomorphic_op): New function.  Check if the
>> > > >> + non-isomorphic operation is supported or not.
>> > > >> + (vect_build_slp_tree_1): Consider non-isomorphic operations.
>> > > >> + (vect_build_slp_tree): Change argument.
>> > > >> + * tree-vect-stmts.c (vectorizable_operation): Consider the opcode
>> > > >> + for non-isomorphic operations.
>> > > >> + * optabs.def (vec_addsub_optab, vec_subadd_optab): New optabs.
>> > > >> + * tree.def (VEC_ADDSUB_EXPR, VEC_SUBADD_EXPR): New operations.
>> > > >> + * expr.c (expand_expr_real_2): Add support to VEC_ADDSUB_EXPR and
>> > > >> + VEC_SUBADD_EXPR.
>> > > >> + * gimple-pretty-print.c (dump_binary_rhs): Likewise.
>> > > >> + * optabs.c (optab_for_tree_code): Likewise.
>> > > >> + * tree-cfg.c (verify_gimple_assign_binary): Likewise.
>> > > >> + * tree-vectorizer.h (struct _slp_tree): New data member.
>> > > >> + * config/i386/i386-protos.h (ix86_sse_expand_fp_addsub_operator):
>> > > >> + New function.  Expand addsub/subadd operations for SSE2.
>> > > >> + * config/i386/i386.c (ix86_sse_expand_fp_addsub_operator): Likewise.
>> > > >> + * config/i386/sse.md (UNSPEC_SUBADD, UNSPEC_ADDSUB): New RTL operation.
>> > > >> + (vec_subadd_v4sf3, vec_subadd_v2df3, vec_subadd_<mode>3,
>> > > >> + vec_addsub_v4sf3, vec_addsub_v2df3, vec_addsub_<mode>3):
>> > > >> + Expand addsub/subadd operations for SSE/SSE2/SSE3/AVX.
>> > > >> + * doc/generic.texi (VEC_ADDSUB_EXPR, VEC_SUBADD_EXPR): New doc.
>> > > >> + * doc/md.texi (vec_addsub_@var{m}3, vec_subadd_@var{m}3): New doc.
>> > > >> +
>> > > >>  2013-11-12  Jeff Law  <law@redhat.com>
>> > > >>
>> > > >>   * tree-ssa-threadedge.c (thread_around_empty_blocks): New
>> > > >> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
>> > > >> index fdf9d58..b02b757 100644
>> > > >> --- a/gcc/config/i386/i386-protos.h
>> > > >> +++ b/gcc/config/i386/i386-protos.h
>> > > >> @@ -117,6 +117,7 @@ extern rtx ix86_expand_adjust_ufix_to_sfix_si (rtx, rtx *);
>> > > >>  extern enum ix86_fpcmp_strategy ix86_fp_comparison_strategy (enum rtx_code);
>> > > >>  extern void ix86_expand_fp_absneg_operator (enum rtx_code, enum machine_mode,
>> > > >>      rtx[]);
>> > > >> +extern void ix86_sse_expand_fp_addsub_operator (bool, enum
>> > > >> machine_mode, rtx[]);
>> > > >>  extern void ix86_expand_copysign (rtx []);
>> > > >>  extern void ix86_split_copysign_const (rtx []);
>> > > >>  extern void ix86_split_copysign_var (rtx []);
>> > > >> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> > > >> index 5287b49..76f38f5 100644
>> > > >> --- a/gcc/config/i386/i386.c
>> > > >> +++ b/gcc/config/i386/i386.c
>> > > >> @@ -18702,6 +18702,51 @@ ix86_expand_fp_absneg_operator (enum rtx_code
>> > > >> code, enum machine_mode mode,
>> > > >>      emit_insn (set);
>> > > >>  }
>> > > >>
>> > > >> +/* Generate code for addsub or subadd on fp vectors for sse/sse2.  The flag
>> > > >> +   SUBADD indicates if we are generating code for subadd or addsub.  */
>> > > >> +
>> > > >> +void
>> > > >> +ix86_sse_expand_fp_addsub_operator (bool subadd, enum machine_mode mode,
>> > > >> +    rtx operands[])
>> > > >> +{
>> > > >> +  rtx mask;
>> > > >> +  rtx neg_mask32 = GEN_INT (0x80000000);
>> > > >> +  rtx neg_mask64 = GEN_INT ((HOST_WIDE_INT)1 << 63);
>> > > >> +
>> > > >> +  switch (mode)
>> > > >> +    {
>> > > >> +    case V4SFmode:
>> > > >> +      if (subadd)
>> > > >> + mask = gen_rtx_CONST_VECTOR (V4SImode, gen_rtvec (4,
>> > > >> + neg_mask32, const0_rtx, neg_mask32, const0_rtx));
>> > > >> +      else
>> > > >> + mask = gen_rtx_CONST_VECTOR (V4SImode, gen_rtvec (4,
>> > > >> + const0_rtx, neg_mask32, const0_rtx, neg_mask32));
>> > > >> +      break;
>> > > >> +
>> > > >> +    case V2DFmode:
>> > > >> +      if (subadd)
>> > > >> + mask = gen_rtx_CONST_VECTOR (V2DImode, gen_rtvec (2,
>> > > >> + neg_mask64, const0_rtx));
>> > > >> +      else
>> > > >> + mask = gen_rtx_CONST_VECTOR (V2DImode, gen_rtvec (2,
>> > > >> + const0_rtx, neg_mask64));
>> > > >> +      break;
>> > > >> +
>> > > >> +    default:
>> > > >> +      gcc_unreachable ();
>> > > >> +    }
>> > > >> +
>> > > >> +  rtx tmp = gen_reg_rtx (mode);
>> > > >> +  convert_move (tmp, mask, false);
>> > > >> +
>> > > >> +  rtx tmp2 = gen_reg_rtx (mode);
>> > > >> +  tmp2 = expand_simple_binop (mode, XOR, tmp, operands[2],
>> > > >> +      tmp2, 0, OPTAB_DIRECT);
>> > > >> +  expand_simple_binop (mode, PLUS, operands[1], tmp2,
>> > > >> +       operands[0], 0, OPTAB_DIRECT);
>> > > >> +}
>> > > >> +
>> > > >>  /* Expand a copysign operation.  Special case operand 0 being a constant.  */
>> > > >>
>> > > >>  void
>> > > >> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>> > > >> index 7bb2d77..4369b2e 100644
>> > > >> --- a/gcc/config/i386/sse.md
>> > > >> +++ b/gcc/config/i386/sse.md
>> > > >> @@ -25,6 +25,8 @@
>> > > >>
>> > > >>    ;; SSE3
>> > > >>    UNSPEC_LDDQU
>> > > >> +  UNSPEC_SUBADD
>> > > >> +  UNSPEC_ADDSUB
>> > > >>
>> > > >>    ;; SSSE3
>> > > >>    UNSPEC_PSHUFB
>> > > >> @@ -1508,6 +1510,80 @@
>> > > >>     (set_attr "prefix" "orig,vex")
>> > > >>     (set_attr "mode" "<MODE>")])
>> > > >>
>> > > >> +(define_expand "vec_subadd_v4sf3"
>> > > >> +  [(set (match_operand:V4SF 0 "register_operand")
>> > > >> + (unspec:V4SF
>> > > >> +  [(match_operand:V4SF 1 "register_operand")
>> > > >> +   (match_operand:V4SF 2 "nonimmediate_operand")] UNSPEC_SUBADD))]
>> > > >> +  "TARGET_SSE"
>> > > >> +{
>> > > >> +  if (TARGET_SSE3)
>> > > >> +    emit_insn (gen_sse3_addsubv4sf3 (operands[0], operands[1], operands[2]));
>> > > >> +  else
>> > > >> +    ix86_sse_expand_fp_addsub_operator (true, V4SFmode, operands);
>> > > >> +  DONE;
>> > > >> +})
>> > > >> +
>> > > >> +(define_expand "vec_subadd_v2df3"
>> > > >> +  [(set (match_operand:V2DF 0 "register_operand")
>> > > >> + (unspec:V2DF
>> > > >> +  [(match_operand:V2DF 1 "register_operand")
>> > > >> +   (match_operand:V2DF 2 "nonimmediate_operand")] UNSPEC_SUBADD))]
>> > > >> +  "TARGET_SSE2"
>> > > >> +{
>> > > >> +  if (TARGET_SSE3)
>> > > >> +    emit_insn (gen_sse3_addsubv2df3 (operands[0], operands[1], operands[2]));
>> > > >> +  else
>> > > >> +    ix86_sse_expand_fp_addsub_operator (true, V2DFmode, operands);
>> > > >> +  DONE;
>> > > >> +})
>> > > >> +
>> > > >> +(define_expand "vec_subadd_<mode>3"
>> > > >> +  [(set (match_operand:VF_256 0 "register_operand")
>> > > >> + (unspec:VF_256
>> > > >> +  [(match_operand:VF_256 1 "register_operand")
>> > > >> +   (match_operand:VF_256 2 "nonimmediate_operand")] UNSPEC_SUBADD))]
>> > > >> +  "TARGET_AVX"
>> > > >> +{
>> > > >> +  emit_insn (gen_avx_addsub<mode>3 (operands[0], operands[1], operands[2]));
>> > > >> +  DONE;
>> > > >> +})
>> > > >> +
>> > > >> +(define_expand "vec_addsub_v4sf3"
>> > > >> +  [(set (match_operand:V4SF 0 "register_operand")
>> > > >> + (unspec:V4SF
>> > > >> +  [(match_operand:V4SF 1 "register_operand")
>> > > >> +   (match_operand:V4SF 2 "nonimmediate_operand")] UNSPEC_SUBADD))]
>> > > >> +  "TARGET_SSE"
>> > > >> +{
>> > > >> +  ix86_sse_expand_fp_addsub_operator (false, V4SFmode, operands);
>> > > >> +  DONE;
>> > > >> +})
>> > > >> +
>> > > >> +(define_expand "vec_addsub_v2df3"
>> > > >> +  [(set (match_operand:V2DF 0 "register_operand")
>> > > >> + (unspec:V2DF
>> > > >> +  [(match_operand:V2DF 1 "register_operand")
>> > > >> +   (match_operand:V2DF 2 "nonimmediate_operand")] UNSPEC_SUBADD))]
>> > > >> +  "TARGET_SSE2"
>> > > >> +{
>> > > >> +  ix86_sse_expand_fp_addsub_operator (false, V2DFmode, operands);
>> > > >> +  DONE;
>> > > >> +})
>> > > >> +
>> > > >> +(define_expand "vec_addsub_<mode>3"
>> > > >> +  [(set (match_operand:VF_256 0 "register_operand")
>> > > >> + (unspec:VF_256
>> > > >> +  [(match_operand:VF_256 1 "register_operand")
>> > > >> +   (match_operand:VF_256 2 "nonimmediate_operand")] UNSPEC_ADDSUB))]
>> > > >> +  "TARGET_AVX"
>> > > >> +{
>> > > >> +  rtx tmp = gen_reg_rtx (<MODE>mode);
>> > > >> +  emit_insn (gen_neg<mode>2 (tmp, operands[2]));
>> > > >> +  emit_insn (gen_avx_addsub<mode>3 (operands[0], operands[1], tmp));
>> > > >> +  DONE;
>> > > >> +})
>> > > >> +
>> > > >>  (define_insn "avx_addsubv4df3"
>> > > >>    [(set (match_operand:V4DF 0 "register_operand" "=x")
>> > > >>   (vec_merge:V4DF
>> > > >> diff --git a/gcc/doc/generic.texi b/gcc/doc/generic.texi
>> > > >> index f2dd0ff..0870d6f 100644
>> > > >> --- a/gcc/doc/generic.texi
>> > > >> +++ b/gcc/doc/generic.texi
>> > > >> @@ -1715,6 +1715,8 @@ a value from @code{enum annot_expr_kind}.
>> > > >>  @tindex VEC_PACK_TRUNC_EXPR
>> > > >>  @tindex VEC_PACK_SAT_EXPR
>> > > >>  @tindex VEC_PACK_FIX_TRUNC_EXPR
>> > > >> +@tindex VEC_ADDSUB_EXPR
>> > > >> +@tindex VEC_SUBADD_EXPR
>> > > >>
>> > > >>  @table @code
>> > > >>  @item VEC_LSHIFT_EXPR
>> > > >> @@ -1795,6 +1797,12 @@ value, it is taken from the second operand. It
>> > > >> should never evaluate to
>> > > >>  any other value currently, but optimizations should not rely on that
>> > > >>  property. In contrast with a @code{COND_EXPR}, all operands are always
>> > > >>  evaluated.
>> > > >> +
>> > > >> +@item VEC_ADDSUB_EXPR
>> > > >> +@itemx VEC_SUBADD_EXPR
>> > > >> +These nodes represent add/sub and sub/add operations on even/odd elements
>> > > >> +of two vectors respectively.  The two operands must be vectors of the same
>> > > >> +size and number of elements.
>> > > >>  @end table
>> > > >>
>> > > >>
>> > > >> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
>> > > >> index 1a06e3d..d9726d2 100644
>> > > >> --- a/gcc/doc/md.texi
>> > > >> +++ b/gcc/doc/md.texi
>> > > >> @@ -4885,6 +4885,12 @@ with N signed/unsigned elements of size S@.
>> > > >> Operand 2 is a constant.  Shift
>> > > >>  the high/low elements of operand 1, and put the N/2 results of size 2*S in the
>> > > >>  output vector (operand 0).
>> > > >>
>> > > >> +@cindex @code{vec_addsub_@var{m}3} instruction pattern
>> > > >> +@cindex @code{vec_subadd_@var{m}3} instruction pattern
>> > > >> +@item @samp{vec_addsub_@var{m}3}, @samp{vec_subadd_@var{m}3}
>> > > >> +Perform add/sub or sub/add on even/odd elements of two vectors.  Each
>> > > >> +operand is a vector with N elements of size S@.
>> > > >> +
>> > > >>  @cindex @code{mulhisi3} instruction pattern
>> > > >>  @item @samp{mulhisi3}
>> > > >>  Multiply operands 1 and 2, which have mode @code{HImode}, and store
>> > > >> diff --git a/gcc/expr.c b/gcc/expr.c
>> > > >> index 28b4332..997cfe2 100644
>> > > >> --- a/gcc/expr.c
>> > > >> +++ b/gcc/expr.c
>> > > >> @@ -8743,6 +8743,8 @@ expand_expr_real_2 (sepops ops, rtx target, enum
>> > > >> machine_mode tmode,
>> > > >>      case BIT_AND_EXPR:
>> > > >>      case BIT_IOR_EXPR:
>> > > >>      case BIT_XOR_EXPR:
>> > > >> +    case VEC_ADDSUB_EXPR:
>> > > >> +    case VEC_SUBADD_EXPR:
>> > > >>        goto binop;
>> > > >>
>> > > >>      case LROTATE_EXPR:
>> > > >> diff --git a/gcc/gimple-pretty-print.c b/gcc/gimple-pretty-print.c
>> > > >> index 6842213..e5c7a93 100644
>> > > >> --- a/gcc/gimple-pretty-print.c
>> > > >> +++ b/gcc/gimple-pretty-print.c
>> > > >> @@ -355,6 +355,8 @@ dump_binary_rhs (pretty_printer *buffer, gimple
>> > > >> gs, int spc, int flags)
>> > > >>      case VEC_PACK_FIX_TRUNC_EXPR:
>> > > >>      case VEC_WIDEN_LSHIFT_HI_EXPR:
>> > > >>      case VEC_WIDEN_LSHIFT_LO_EXPR:
>> > > >> +    case VEC_ADDSUB_EXPR:
>> > > >> +    case VEC_SUBADD_EXPR:
>> > > >>        for (p = get_tree_code_name (code); *p; p++)
>> > > >>   pp_character (buffer, TOUPPER (*p));
>> > > >>        pp_string (buffer, " <");
>> > > >> diff --git a/gcc/optabs.c b/gcc/optabs.c
>> > > >> index 164e4dd..a725117 100644
>> > > >> --- a/gcc/optabs.c
>> > > >> +++ b/gcc/optabs.c
>> > > >> @@ -547,6 +547,12 @@ optab_for_tree_code (enum tree_code code, const_tree type,
>> > > >>        return TYPE_UNSIGNED (type) ?
>> > > >>   vec_pack_ufix_trunc_optab : vec_pack_sfix_trunc_optab;
>> > > >>
>> > > >> +    case VEC_ADDSUB_EXPR:
>> > > >> +      return vec_addsub_optab;
>> > > >> +
>> > > >> +    case VEC_SUBADD_EXPR:
>> > > >> +      return vec_subadd_optab;
>> > > >> +
>> > > >>      default:
>> > > >>        break;
>> > > >>      }
>> > > >> diff --git a/gcc/optabs.def b/gcc/optabs.def
>> > > >> index 6b924ac..3a09c52 100644
>> > > >> --- a/gcc/optabs.def
>> > > >> +++ b/gcc/optabs.def
>> > > >> @@ -281,6 +281,8 @@ OPTAB_D (vec_widen_umult_lo_optab, "vec_widen_umult_lo_$a")
>> > > >>  OPTAB_D (vec_widen_umult_odd_optab, "vec_widen_umult_odd_$a")
>> > > >>  OPTAB_D (vec_widen_ushiftl_hi_optab, "vec_widen_ushiftl_hi_$a")
>> > > >>  OPTAB_D (vec_widen_ushiftl_lo_optab, "vec_widen_ushiftl_lo_$a")
>> > > >> +OPTAB_D (vec_addsub_optab, "vec_addsub_$a3")
>> > > >> +OPTAB_D (vec_subadd_optab, "vec_subadd_$a3")
>> > > >>
>> > > >>  OPTAB_D (sync_add_optab, "sync_add$I$a")
>> > > >>  OPTAB_D (sync_and_optab, "sync_and$I$a")
>> > > >> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
>> > > >> index 09c7f20..efd6c24 100644
>> > > >> --- a/gcc/testsuite/ChangeLog
>> > > >> +++ b/gcc/testsuite/ChangeLog
>> > > >> @@ -1,3 +1,10 @@
>> > > >> +2013-11-14  Cong Hou  <congh@google.com>
>> > > >> +
>> > > >> + * lib/target-supports.exp (check_effective_target_vect_addsub):
>> > > >> + New target.
>> > > >> + * gcc.dg/vect/vect-addsub-float.c: New test.
>> > > >> + * gcc.dg/vect/vect-addsub-double.c: New test.
>> > > >> +
>> > > >>  2013-11-12  Balaji V. Iyer  <balaji.v.iyer@intel.com>
>> > > >>
>> > > >>   * gcc.dg/cilk-plus/cilk-plus.exp: Added a check for LTO before running
>> > > >> diff --git a/gcc/testsuite/gcc.dg/vect/vect-addsub-double.c
>> > > >> b/gcc/testsuite/gcc.dg/vect/vect-addsub-double.c
>> > > >> new file mode 100644
>> > > >> index 0000000..5399dde
>> > > >> --- /dev/null
>> > > >> +++ b/gcc/testsuite/gcc.dg/vect/vect-addsub-double.c
>> > > >> @@ -0,0 +1,51 @@
>> > > >> +/* { dg-require-effective-target vect_addsub } */
>> > > >> +/* { dg-additional-options "-fdump-tree-slp-details" } */
>> > > >> +
>> > > >> +#include "tree-vect.h"
>> > > >> +
>> > > >> +double a[4], b[4], c[4];
>> > > >> +
>> > > >> +void subadd ()
>> > > >> +{
>> > > >> +  c[0] = a[0] - b[0];
>> > > >> +  c[1] = a[1] + b[1];
>> > > >> +  c[2] = a[2] - b[2];
>> > > >> +  c[3] = a[3] + b[3];
>> > > >> +}
>> > > >> +
>> > > >> +void addsub ()
>> > > >> +{
>> > > >> +  c[0] = a[0] + b[0];
>> > > >> +  c[1] = a[1] - b[1];
>> > > >> +  c[2] = a[2] + b[2];
>> > > >> +  c[3] = a[3] - b[3];
>> > > >> +}
>> > > >> +
>> > > >> +int main()
>> > > >> +{
>> > > >> +  int i;
>> > > >> +  for (i = 0; i < 4; ++i)
>> > > >> +    {
>> > > >> +      a[i] = (i + 1.2) / 3.4;
>> > > >> +      b[i] = (i + 5.6) / 7.8;
>> > > >> +    }
>> > > >> +
>> > > >> +  subadd ();
>> > > >> +
>> > > >> +  if (c[0] != a[0] - b[0]
>> > > >> +      || c[1] != a[1] + b[1]
>> > > >> +      || c[2] != a[2] - b[2]
>> > > >> +      || c[3] != a[3] + b[3])
>> > > >> +    abort ();
>> > > >> +
>> > > >> +  addsub ();
>> > > >> +
>> > > >> +  if (c[0] != a[0] + b[0]
>> > > >> +      || c[1] != a[1] - b[1]
>> > > >> +      || c[2] != a[2] + b[2]
>> > > >> +      || c[3] != a[3] - b[3])
>> > > >> +    abort ();
>> > > >> +}
>> > > >> +
>> > > >> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 2 "slp" } } */
>> > > >> +/* { dg-final { cleanup-tree-dump "slp" } } */
>> > > >> diff --git a/gcc/testsuite/gcc.dg/vect/vect-addsub-float.c
>> > > >> b/gcc/testsuite/gcc.dg/vect/vect-addsub-float.c
>> > > >> new file mode 100644
>> > > >> index 0000000..5b780f3
>> > > >> --- /dev/null
>> > > >> +++ b/gcc/testsuite/gcc.dg/vect/vect-addsub-float.c
>> > > >> @@ -0,0 +1,51 @@
>> > > >> +/* { dg-require-effective-target vect_addsub } */
>> > > >> +/* { dg-additional-options "-fdump-tree-slp-details" } */
>> > > >> +
>> > > >> +#include "tree-vect.h"
>> > > >> +
>> > > >> +float a[4], b[4], c[4];
>> > > >> +
>> > > >> +void subadd ()
>> > > >> +{
>> > > >> +  c[0] = a[0] - b[0];
>> > > >> +  c[1] = a[1] + b[1];
>> > > >> +  c[2] = a[2] - b[2];
>> > > >> +  c[3] = a[3] + b[3];
>> > > >> +}
>> > > >> +
>> > > >> +void addsub ()
>> > > >> +{
>> > > >> +  c[0] = a[0] + b[0];
>> > > >> +  c[1] = a[1] - b[1];
>> > > >> +  c[2] = a[2] + b[2];
>> > > >> +  c[3] = a[3] - b[3];
>> > > >> +}
>> > > >> +
>> > > >> +int main()
>> > > >> +{
>> > > >> +  int i;
>> > > >> +  for (i = 0; i < 4; ++i)
>> > > >> +    {
>> > > >> +      a[i] = (i + 1.2) / 3.4;
>> > > >> +      b[i] = (i + 5.6) / 7.8;
>> > > >> +    }
>> > > >> +
>> > > >> +  subadd ();
>> > > >> +
>> > > >> +  if (c[0] != a[0] - b[0]
>> > > >> +      || c[1] != a[1] + b[1]
>> > > >> +      || c[2] != a[2] - b[2]
>> > > >> +      || c[3] != a[3] + b[3])
>> > > >> +    abort ();
>> > > >> +
>> > > >> +  addsub ();
>> > > >> +
>> > > >> +  if (c[0] != a[0] + b[0]
>> > > >> +      || c[1] != a[1] - b[1]
>> > > >> +      || c[2] != a[2] + b[2]
>> > > >> +      || c[3] != a[3] - b[3])
>> > > >> +    abort ();
>> > > >> +}
>> > > >> +
>> > > >> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 2 "slp" } } */
>> > > >> +/* { dg-final { cleanup-tree-dump "slp" } } */
>> > > >> diff --git a/gcc/testsuite/lib/target-supports.exp
>> > > >> b/gcc/testsuite/lib/target-supports.exp
>> > > >> index c3d9712..f336f77 100644
>> > > >> --- a/gcc/testsuite/lib/target-supports.exp
>> > > >> +++ b/gcc/testsuite/lib/target-supports.exp
>> > > >> @@ -4099,6 +4099,15 @@ proc check_effective_target_vect_extract_even_odd { } {
>> > > >>      return $et_vect_extract_even_odd_saved
>> > > >>  }
>> > > >>
>> > > >> +# Return 1 if the target supports vector addsub and subadd
>> > > >> operations, 0 otherwise.
>> > > >> +
>> > > >> +proc check_effective_target_vect_addsub { } {
>> > > >> +    if { [check_effective_target_sse2] } {
>> > > >> + return 1
>> > > >> +    }
>> > > >> +    return 0
>> > > >> +}
>> > > >> +
>> > > >>  # Return 1 if the target supports vector interleaving, 0 otherwise.
>> > > >>
>> > > >>  proc check_effective_target_vect_interleave { } {
>> > > >> diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
>> > > >> index 601efd6..2bf1b79 100644
>> > > >> --- a/gcc/tree-cfg.c
>> > > >> +++ b/gcc/tree-cfg.c
>> > > >> @@ -3572,6 +3572,23 @@ verify_gimple_assign_binary (gimple stmt)
>> > > >>          return false;
>> > > >>        }
>> > > >>
>> > > >> +    case VEC_SUBADD_EXPR:
>> > > >> +    case VEC_ADDSUB_EXPR:
>> > > >> +      {
>> > > >> +        if (TREE_CODE (rhs1_type) != VECTOR_TYPE
>> > > >> +            || TREE_CODE (rhs2_type) != VECTOR_TYPE
>> > > >> +            || TREE_CODE (lhs_type) != VECTOR_TYPE)
>> > > >> +          {
>> > > >> +            error ("type mismatch in addsub/subadd expression");
>> > > >> +            debug_generic_expr (lhs_type);
>> > > >> +            debug_generic_expr (rhs1_type);
>> > > >> +            debug_generic_expr (rhs2_type);
>> > > >> +            return true;
>> > > >> +          }
>> > > >> +
>> > > >> +        return false;
>> > > >> +      }
>> > > >> +
>> > > >>      case PLUS_EXPR:
>> > > >>      case MINUS_EXPR:
>> > > >>        {
>> > > >> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
>> > > >> index 825f73a..1169d33 100644
>> > > >> --- a/gcc/tree-vect-slp.c
>> > > >> +++ b/gcc/tree-vect-slp.c
>> > > >> @@ -125,6 +125,7 @@ vect_create_new_slp_node (vec<gimple> scalar_stmts)
>> > > >>    SLP_TREE_VEC_STMTS (node).create (0);
>> > > >>    SLP_TREE_CHILDREN (node).create (nops);
>> > > >>    SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
>> > > >> +  SLP_TREE_OP_CODE (node) = ERROR_MARK;
>> > > >>
>> > > >>    return node;
>> > > >>  }
>> > > >> @@ -383,8 +384,74 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
>> > > >>    return true;
>> > > >>  }
>> > > >>
>> > > >> +/* Check if the target supports the vector operation that performs the
>> > > >> +   operation of FIRST_STMT_CODE on even elements and the operation as in STMT
>> > > >> +   on odd elements.  If yes, set the code of NODE to that of the new operation
>> > > >> +   and return true.  Otherwise return false.  This enables SLP vectorization
>> > > >> +   for the following code:
>> > > >>
>> > > >> -/* Verify if the scalar stmts STMTS are isomorphic, require data
>> > > >> +           a[0] = b[0] + c[0];
>> > > >> +           a[1] = b[1] - c[1];
>> > > >> +           a[2] = b[2] + c[2];
>> > > >> +           a[3] = b[3] - c[3];
>> > > >> +   */
>> > > >> +
>> > > >> +static bool
>> > > >> +slp_supported_non_isomorphic_op (enum tree_code first_stmt_code,
>> > > >> + gimple stmt,
>> > > >> + slp_tree *node)
>> > > >> +{
>> > > >> +  if (!is_gimple_assign (stmt))
>> > > >> +    return false;
>> > > >> +
>> > > >> +  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
>> > > >> +  enum tree_code vec_opcode = ERROR_MARK;
>> > > >> +
>> > > >> +  switch (first_stmt_code)
>> > > >> +    {
>> > > >> +    case PLUS_EXPR:
>> > > >> +      if (rhs_code == MINUS_EXPR)
>> > > >> + vec_opcode = VEC_ADDSUB_EXPR;
>> > > >> +      break;
>> > > >> +
>> > > >> +    case MINUS_EXPR:
>> > > >> +      if (rhs_code == PLUS_EXPR)
>> > > >> + vec_opcode = VEC_SUBADD_EXPR;
>> > > >> +      break;
>> > > >> +
>> > > >> +    default:
>> > > >> +      return false;
>> > > >> +    }
>> > > >> +
>> > > >> +  if (vec_opcode == ERROR_MARK)
>> > > >> +    return false;
>> > > >> +
>> > > >> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>> > > >> +  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>> > > >> +  if (!vectype)
>> > > >> +    {
>> > > >> +      vectype = get_vectype_for_scalar_type
>> > > >> +                  (TREE_TYPE (gimple_assign_rhs1 (stmt)));
>> > > >> +      gcc_assert (vectype);
>> > > >> +    }
>> > > >> +
>> > > >> +  optab optab = optab_for_tree_code (vec_opcode, vectype, optab_default);
>> > > >> +  if (!optab)
>> > > >> +    return false;
>> > > >> +
>> > > >> +  int icode = (int) optab_handler (optab, TYPE_MODE (vectype));
>> > > >> +  if (icode == CODE_FOR_nothing)
>> > > >> +    return false;
>> > > >> +
>> > > >> +  if (SLP_TREE_OP_CODE (*node) != ERROR_MARK
>> > > >> +      && SLP_TREE_OP_CODE (*node) != vec_opcode)
>> > > >> +    return false;
>> > > >> +
>> > > >> +  SLP_TREE_OP_CODE (*node) = vec_opcode;
>> > > >> +  return true;
>> > > >> +}
>> > > >> +
>> > > >> +/* Verify if the scalar stmts of NODE are isomorphic, require data
>> > > >>     permutation or are of unsupported types of operation.  Return
>> > > >>     true if they are, otherwise return false and indicate in *MATCHES
>> > > >>     which stmts are not isomorphic to the first one.  If MATCHES[0]
>> > > >> @@ -393,11 +460,12 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
>> > > >>
>> > > >>  static bool
>> > > >>  vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
>> > > >> -       vec<gimple> stmts, unsigned int group_size,
>> > > >> +                       slp_tree *node, unsigned int group_size,
>> > > >>         unsigned nops, unsigned int *max_nunits,
>> > > >>         unsigned int vectorization_factor, bool *matches)
>> > > >>  {
>> > > >>    unsigned int i;
>> > > >> +  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
>> > > >>    gimple stmt = stmts[0];
>> > > >>    enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
>> > > >>    enum tree_code first_cond_code = ERROR_MARK;
>> > > >> @@ -583,7 +651,10 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
>> > > >>   }
>> > > >>        else
>> > > >>   {
>> > > >> -  if (first_stmt_code != rhs_code
>> > > >> +  if ((first_stmt_code != rhs_code
>> > > >> + && (i % 2 == 0
>> > > >> +     || !slp_supported_non_isomorphic_op (first_stmt_code,
>> > > >> +  stmt, node)))
>> > > >>        && (first_stmt_code != IMAGPART_EXPR
>> > > >>    || rhs_code != REALPART_EXPR)
>> > > >>        && (first_stmt_code != REALPART_EXPR
>> > > >> @@ -868,7 +939,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
>> > > >>      return false;
>> > > >>
>> > > >>    if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
>> > > >> -      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
>> > > >> +      node, group_size, nops,
>> > > >>        max_nunits, vectorization_factor, matches))
>> > > >>      return false;
>> > > >>
>> > > >> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
>> > > >> index b0e0fa9..98906f0 100644
>> > > >> --- a/gcc/tree-vect-stmts.c
>> > > >> +++ b/gcc/tree-vect-stmts.c
>> > > >> @@ -3512,7 +3512,13 @@ vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
>> > > >>    if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
>> > > >>      return false;
>> > > >>
>> > > >> -  code = gimple_assign_rhs_code (stmt);
>> > > >> +  /* Check if this slp_node will be vectorized by non-isomorphic operations,
>> > > >> +     in which case the operation on vectors is stored in
>> > > >> +     SLP_TREE_OP_CODE (slp_node).  */
>> > > >> +  if (slp_node && SLP_TREE_OP_CODE (slp_node) != ERROR_MARK)
>> > > >> +    code = SLP_TREE_OP_CODE (slp_node);
>> > > >> +  else
>> > > >> +    code = gimple_assign_rhs_code (stmt);
>> > > >>
>> > > >>    /* For pointer addition, we should use the normal plus for
>> > > >>       the vector addition.  */
>> > > >> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
>> > > >> index bbd50e1..19c09ae 100644
>> > > >> --- a/gcc/tree-vectorizer.h
>> > > >> +++ b/gcc/tree-vectorizer.h
>> > > >> @@ -117,6 +117,10 @@ struct _slp_tree {
>> > > >>       scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF
>> > > >>       divided by vector size.  */
>> > > >>    unsigned int vec_stmts_size;
>> > > >> +  /* The operation code used in the vectorized statement if it is not
>> > > >> +     ERROR_MARK.  Otherwise the operation is determined by the original
>> > > >> +     statement.  */
>> > > >> +  enum tree_code op_code;
>> > > >>  };
>> > > >>
>> > > >>
>> > > >> @@ -157,6 +161,7 @@ typedef struct _slp_instance {
>> > > >>  #define SLP_TREE_VEC_STMTS(S)                    (S)->vec_stmts
>> > > >>  #define SLP_TREE_NUMBER_OF_VEC_STMTS(S)          (S)->vec_stmts_size
>> > > >>  #define SLP_TREE_LOAD_PERMUTATION(S)             (S)->load_permutation
>> > > >> +#define SLP_TREE_OP_CODE(S)                         (S)->op_code
>> > > >>
>> > > >>  /* This structure is used in creation of an SLP tree.  Each instance
>> > > >>     corresponds to the same operand in a group of scalar stmts in an SLP
>> > > >> diff --git a/gcc/tree.def b/gcc/tree.def
>> > > >> index 6763e78..c3eda42 100644
>> > > >> --- a/gcc/tree.def
>> > > >> +++ b/gcc/tree.def
>> > > >> @@ -1256,6 +1256,13 @@ DEFTREECODE (VEC_PACK_FIX_TRUNC_EXPR, "vec_pack_fix_trunc_expr", tcc_binary, 2)
>> > > >>  DEFTREECODE (VEC_WIDEN_LSHIFT_HI_EXPR, "widen_lshift_hi_expr", tcc_binary, 2)
>> > > >>  DEFTREECODE (VEC_WIDEN_LSHIFT_LO_EXPR, "widen_lshift_lo_expr", tcc_binary, 2)
>> > > >>
>> > > >> +/* Add even/odd elements and sub odd/even elements between two vectors.
>> > > >> +   Operand 0 and operand 1 are two operands.
>> > > >> +   The result of this operation is a vector with the same type of operand 0/1.
>> > > >> + */
>> > > >> +DEFTREECODE (VEC_ADDSUB_EXPR, "addsub_expr", tcc_binary, 2)
>> > > >> +DEFTREECODE (VEC_SUBADD_EXPR, "subadd_expr", tcc_binary, 2)
>> > > >> +
>> > > >>  /* PREDICT_EXPR.  Specify hint for branch prediction.  The
>> > > >>     PREDICT_EXPR_PREDICTOR specify predictor and PREDICT_EXPR_OUTCOME the
>> > > >>     outcome (0 for not taken and 1 for taken).  Once the profile is guessed
>> > > >>
>> > > >
>> > > > --
>> > > > Richard Biener <rguenther@suse.de>
>> > > > SUSE / SUSE Labs
>> > > > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> > > > GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
>> > >
>> > >
>> >
>> > --
>> > Richard Biener <rguenther@suse.de>
>> > SUSE / SUSE Labs
>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
>> > GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
>>
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE / SUSE Labs
> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2c0554b..2061869 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2013-11-20  Cong Hou  <congh@google.com>
+
+	PR tree-optimization/56902
+	* config/i386/i386-protos.h (ix86_generate_mask_from_vec_merge):
+	New function.
+	* config/i386/i386.c (ix86_generate_mask_from_vec_merge): Likewise.
+	(expand_vec_perm_blend): Add a new flag to allow emulating the blend
+	instruction on SSE/SSE2.
+	(ix86_expand_vec_perm_const_1): Try emulating the blend instruction.
+	* config/i386/sse.md (sse3_addsubv2df3): Match the pattern of addsub
+	for V4DF mode.
+	(*vec_merge<mode>): New define_insn_and_split for splitting vec_merge.
+	* tree-vect-slp.c (vect_build_slp_tree_1): Add the support of detecting
+	addsub pattern for SLP (from Richard Biener).
+	(vect_schedule_slp_instance): Likewise.
+
 2013-11-12  Jeff Law  <law@redhat.com>
 
 	* tree-ssa-threadedge.c (thread_around_empty_blocks): New
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index fdf9d58..1f7d1ea 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -106,6 +106,7 @@ extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
 					rtx[]);
 extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
 extern rtx ix86_build_signbit_mask (enum machine_mode, bool, bool);
+extern rtx ix86_generate_mask_from_vec_merge (enum machine_mode, rtx);
 extern void ix86_split_convert_uns_si_sse (rtx[]);
 extern void ix86_expand_convert_uns_didf_sse (rtx, rtx);
 extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5287b49..044eb09 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -39633,11 +39633,61 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* Generate a vector mask from a scalar mask, which is used in vec_merge
+   instruction.  */
+
+rtx
+ix86_generate_mask_from_vec_merge (enum machine_mode vmode, rtx mask)
+{
+  int int_mask = INTVAL (mask);
+  rtx bit_mask;
+
+  switch (vmode)
+    {
+    case V2DFmode:
+    case V2DImode:
+      bit_mask = gen_rtx_CONST_VECTOR (V2DImode, gen_rtvec (2,
+	   (int_mask & 1) ? GEN_INT (0xFFFFFFFFFFFFFFFF) : const0_rtx,
+	   (int_mask & 2) ? GEN_INT (0xFFFFFFFFFFFFFFFF) : const0_rtx));
+      break;
+
+    case V4SFmode:
+    case V4SImode:
+      bit_mask = gen_rtx_CONST_VECTOR (V4SImode, gen_rtvec (4,
+	   (int_mask & 1) ? GEN_INT (0xFFFFFFFF) : const0_rtx,
+	   (int_mask & 2) ? GEN_INT (0xFFFFFFFF) : const0_rtx,
+	   (int_mask & 4) ? GEN_INT (0xFFFFFFFF) : const0_rtx,
+	   (int_mask & 8) ? GEN_INT (0xFFFFFFFF) : const0_rtx));
+      break;
+
+    case V8HImode:
+      bit_mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec (8,
+	   (int_mask & 1) ? GEN_INT (0xFFFF) : const0_rtx,
+	   (int_mask & 2) ? GEN_INT (0xFFFF) : const0_rtx,
+	   (int_mask & 4) ? GEN_INT (0xFFFF) : const0_rtx,
+	   (int_mask & 8) ? GEN_INT (0xFFFF) : const0_rtx,
+	   (int_mask & 16) ? GEN_INT (0xFFFF) : const0_rtx,
+	   (int_mask & 32) ? GEN_INT (0xFFFF) : const0_rtx,
+	   (int_mask & 64) ? GEN_INT (0xFFFF) : const0_rtx,
+	   (int_mask & 128) ? GEN_INT (0xFFFF) : const0_rtx));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  rtx ret = gen_reg_rtx (vmode);
+  convert_move (ret, bit_mask, false);
+  return ret;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
-   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
+   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  If EMULATE
+   is set, then we generate vec_merge instruction for SSE/SSE2, which
+   can be emulated by and/andn/or approach.  */
 
 static bool
-expand_vec_perm_blend (struct expand_vec_perm_d *d)
+expand_vec_perm_blend (struct expand_vec_perm_d *d, bool emulate = false)
 {
   enum machine_mode vmode = d->vmode;
   unsigned i, mask, nelt = d->nelt;
@@ -39646,7 +39696,14 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
 
   if (d->one_operand_p)
     return false;
-  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+
+  /* If the flag EMULATE is set, then we will use and/andn/or to emulate a
+     blend instruction.  Note that we don't emulate blend instruction for
+     V16QImode, in which case vec_merge instruction won't be generated, and
+     this is also not useful in practice.  */
+  if (emulate && !TARGET_SSE4_1 && vmode != V16QImode)
+    ;
+  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
@@ -39667,9 +39724,6 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
-  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
-     decision should be extracted elsewhere, so that we only try that
-     sequence once all budget==3 options have been tried.  */
   target = d->target;
   op0 = d->op0;
   op1 = d->op1;
@@ -41629,6 +41683,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_1 (d))
     return true;
 
+  /* Try emulating the blend instruction using other instructions.  */
+  if (expand_vec_perm_blend (d, true))
+    return true;
+
   /* Try sequences of two instructions.  */
 
   if (expand_vec_perm_pshuflw_pshufhw (d))
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index e5dd90c..e77ec3e 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -737,6 +737,11 @@
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 2, 3)")))
 
+;; Match 2 to 255.
+(define_predicate "const_2_to_255_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 2, 255)")))
+
 ;; Match 4 to 5.
 (define_predicate "const_4_to_5_operand"
   (and (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7bb2d77..551a05a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1509,13 +1509,15 @@
    (set_attr "mode" "<MODE>")])
 
 (define_insn "avx_addsubv4df3"
-  [(set (match_operand:V4DF 0 "register_operand" "=x")
-	(vec_merge:V4DF
-	  (plus:V4DF
-	    (match_operand:V4DF 1 "register_operand" "x")
-	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
-	  (minus:V4DF (match_dup 1) (match_dup 2))
-	  (const_int 10)))]
+[(set (match_operand:V4DF 0 "register_operand" "=x,x")
+	(vec_select:V4DF
+	   (vec_concat:V8DF
+	     (minus:V4DF
+	       (match_operand:V4DF 1 "register_operand" "0,x")
+	       (match_operand:V4DF 2 "nonimmediate_operand" "xm,xm"))
+	     (plus:V4DF (match_dup 1) (match_dup 2)))
+	   (parallel [(const_int 0) (const_int 5)
+		      (const_int 2) (const_int 7)])))]
   "TARGET_AVX"
   "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseadd")
@@ -1524,12 +1526,13 @@
 
 (define_insn "sse3_addsubv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x,x")
-	(vec_merge:V2DF
-	  (plus:V2DF
-	    (match_operand:V2DF 1 "register_operand" "0,x")
-	    (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
-	  (minus:V2DF (match_dup 1) (match_dup 2))
-	  (const_int 2)))]
+	(vec_select:V2DF
+	   (vec_concat:V4DF
+	     (minus:V2DF
+	       (match_operand:V2DF 1 "register_operand" "0,x")
+	       (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
+	     (plus:V2DF (match_dup 1) (match_dup 2)))
+	   (parallel [(const_int 0) (const_int 3)])))]
   "TARGET_SSE3"
   "@
    addsubpd\t{%2, %0|%0, %2}
@@ -9716,6 +9719,24 @@
   [(set (attr "length")
      (symbol_ref ("(Pmode != word_mode) + 3")))])
 
+(define_insn_and_split "*vec_merge<mode>"
+  [(set (match_operand:V_128 0 "register_operand")
+	(vec_merge:V_128
+	  (match_operand:V_128 2 "nonimmediate_operand")
+	  (match_operand:V_128 1 "register_operand")
+	  (match_operand:SI 3 "const_2_to_255_operand")))]
+  "TARGET_SSE && !TARGET_SSE4_1 && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(and:V_128 (match_dup 3) (match_dup 2)))
+   (set (match_dup 3)
+	(and:V_128
+	  (not:V_128 (match_dup 3)) (match_dup 1)))
+   (set (match_dup 0)
+	(ior:V_128 (match_dup 0) (match_dup 3)))]
+  "operands[3] = ix86_generate_mask_from_vec_merge (<MODE>mode, operands[3]);")
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; SSSE3 instructions
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 09c7f20..0c8818a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2013-11-20  Cong Hou  <congh@google.com>
+
+	* gcc.target/i386/sse-vect-addsubps.c: New test.
+	* gcc.target/i386/sse2-vect-addsubpd.c: New test.
+	* gcc.target/i386/sse3-vect-addsubps.c: New test.
+	* gcc.target/i386/sse3-vect-addsubpd.c: New test.
+
 2013-11-12  Balaji V. Iyer  <balaji.v.iyer@intel.com>
 
 	* gcc.dg/cilk-plus/cilk-plus.exp: Added a check for LTO before running
diff --git a/gcc/testsuite/gcc.target/i386/sse-vect-addsubps.c b/gcc/testsuite/gcc.target/i386/sse-vect-addsubps.c
new file mode 100644
index 0000000..914020b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-vect-addsubps.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fdump-tree-slp-details" } */
+
+float a[4], b[4], c[4];
+
+void subadd ()
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-vect-addsubpd.c b/gcc/testsuite/gcc.target/i386/sse2-vect-addsubpd.c
new file mode 100644
index 0000000..2f78ec8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-vect-addsubpd.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fdump-tree-slp-details" } */
+
+double a[4], b[4], c[4];
+
+void subadd ()
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse3-vect-addsubpd.c b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubpd.c
new file mode 100644
index 0000000..99d55dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubpd.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fdump-tree-slp-details" } */
+
+double a[4], b[4], c[4];
+
+void subadd ()
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse3-vect-addsubps.c b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubps.c
new file mode 100644
index 0000000..ea440ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubps.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fdump-tree-slp-details" } */
+
+float a[4], b[4], c[4];
+
+void subadd ()
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 825f73a..de7790d 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -398,7 +398,7 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 		       unsigned int vectorization_factor, bool *matches)
 {
   unsigned int i;
-  gimple stmt = stmts[0];
+  gimple first_stmt = stmts[0], stmt = stmts[0];
   enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
   enum tree_code first_cond_code = ERROR_MARK;
   tree lhs;
@@ -588,6 +588,8 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 		  || rhs_code != REALPART_EXPR)
 	      && (first_stmt_code != REALPART_EXPR
 		  || rhs_code != IMAGPART_EXPR)
+	      && (first_stmt_code != MINUS_EXPR
+		  || rhs_code != ((i & 1) ? PLUS_EXPR : MINUS_EXPR))
               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
@@ -601,6 +603,10 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 				   "Build SLP failed: different operation "
 				   "in stmt ");
 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				   "original stmt ");
+		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+				    first_stmt, 0);
                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 		}
 	      /* Mismatch.  */
@@ -3062,6 +3068,63 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
     }
 
+  if (is_gimple_assign (stmt)
+      && (gimple_assign_rhs_code (stmt) == PLUS_EXPR
+	  || gimple_assign_rhs_code (stmt) == MINUS_EXPR))
+    {
+      enum tree_code code0 = gimple_assign_rhs_code (stmt);
+      enum tree_code ocode;
+      gimple ostmt;
+      unsigned char *mask = XCNEWVEC (unsigned char, nunits);
+      bool allsame = true;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
+	if (gimple_assign_rhs_code (ostmt) != code0)
+	  {
+	    mask[i] = i + nunits;
+	    allsame = false;
+	    ocode = gimple_assign_rhs_code (ostmt);
+	  }
+	else
+	  mask[i] = i;
+      if (!allsame)
+	{
+	  vec<gimple> v0;
+	  vec<gimple> v1;
+	  unsigned j;
+	  tree tmask = NULL_TREE;
+	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
+	  SLP_TREE_VEC_STMTS (node).truncate (0);
+	  gimple_assign_set_rhs_code (stmt, ocode);
+	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+	  gimple_assign_set_rhs_code (stmt, code0);
+	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
+	  SLP_TREE_VEC_STMTS (node).truncate (0);
+	  tmask = make_vector (nunits);
+	  TREE_TYPE (tmask) = build_nonstandard_integer_type
+                                (GET_MODE_BITSIZE
+                                  (TYPE_MODE (TREE_TYPE (vectype))), 1);
+	  TREE_TYPE (tmask) = get_same_sized_vectype (TREE_TYPE (tmask),
+                                                      vectype);
+	  for (j = 0; j < nunits; ++j)
+	    VECTOR_CST_ELT (tmask, j) = build_int_cst (TREE_TYPE
+                                          (TREE_TYPE (tmask)), mask[j]);
+	  for (j = 0; j < v0.length (); ++j)
+	    {
+	      gimple vstmt;
+	      tree tem = make_ssa_name (vectype, NULL);
+	      vstmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, tem,
+						    gimple_assign_lhs (v0[j]),
+						    gimple_assign_lhs (v1[j]),
+                                                    tmask);
+	      vect_finish_stmt_generation (stmt, vstmt, &si);
+	      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
+	    }
+	  free (mask);
+	  return false;
+	}
+      free (mask);
+    }
   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
   return is_store;
 }
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2c0554b..2061869 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@ 
+2013-11-20  Cong Hou  <congh@google.com>
+
+ PR tree-optimization/56902
+ * config/i386/i386-protos.h (ix86_generate_mask_from_vec_merge):
+ New function.
+ * config/i386/i386.c (ix86_generate_mask_from_vec_merge): Likewise.
+ (expand_vec_perm_blend): Add a new flag to allow emulating the blend
+ instruction on SSE/SSE2.
+ (ix86_expand_vec_perm_const_1): Try emulating the blend instruction.
+ * config/i386/sse.md (sse3_addsubv2df3): Match the pattern of addsub
+ for V4DF mode.
+ (*vec_merge<mode>): New define_insn_and_split for splitting vec_merge.
+ * tree-vect-slp.c (vect_build_slp_tree_1): Add the support of detecting
+ addsub pattern for SLP (from Richard Biener).
+ (vect_schedule_slp_instance): Likewise.
+
 2013-11-12  Jeff Law  <law@redhat.com>

  * tree-ssa-threadedge.c (thread_around_empty_blocks): New
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index fdf9d58..1f7d1ea 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -106,6 +106,7 @@  extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
  rtx[]);
 extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
 extern rtx ix86_build_signbit_mask (enum machine_mode, bool, bool);
+extern rtx ix86_generate_mask_from_vec_merge (enum machine_mode, rtx);
 extern void ix86_split_convert_uns_si_sse (rtx[]);
 extern void ix86_expand_convert_uns_didf_sse (rtx, rtx);
 extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5287b49..044eb09 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -39633,11 +39633,61 @@  expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }

+/* Generate a vector mask from a scalar mask, which is used in vec_merge
+   instruction.  */
+
+rtx
+ix86_generate_mask_from_vec_merge (enum machine_mode vmode, rtx mask)
+{
+  int int_mask = INTVAL (mask);
+  rtx bit_mask;
+
+  switch (vmode)
+    {
+    case V2DFmode:
+    case V2DImode:
+      bit_mask = gen_rtx_CONST_VECTOR (V2DImode, gen_rtvec (2,
+   (int_mask & 1) ? GEN_INT (0xFFFFFFFFFFFFFFFF) : const0_rtx,
+   (int_mask & 2) ? GEN_INT (0xFFFFFFFFFFFFFFFF) : const0_rtx));
+      break;
+
+    case V4SFmode:
+    case V4SImode:
+      bit_mask = gen_rtx_CONST_VECTOR (V4SImode, gen_rtvec (4,
+   (int_mask & 1) ? GEN_INT (0xFFFFFFFF) : const0_rtx,
+   (int_mask & 2) ? GEN_INT (0xFFFFFFFF) : const0_rtx,
+   (int_mask & 4) ? GEN_INT (0xFFFFFFFF) : const0_rtx,
+   (int_mask & 8) ? GEN_INT (0xFFFFFFFF) : const0_rtx));
+      break;
+
+    case V8HImode:
+      bit_mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec (8,
+   (int_mask & 1) ? GEN_INT (0xFFFF) : const0_rtx,
+   (int_mask & 2) ? GEN_INT (0xFFFF) : const0_rtx,
+   (int_mask & 4) ? GEN_INT (0xFFFF) : const0_rtx,
+   (int_mask & 8) ? GEN_INT (0xFFFF) : const0_rtx,
+   (int_mask & 16) ? GEN_INT (0xFFFF) : const0_rtx,
+   (int_mask & 32) ? GEN_INT (0xFFFF) : const0_rtx,
+   (int_mask & 64) ? GEN_INT (0xFFFF) : const0_rtx,
+   (int_mask & 128) ? GEN_INT (0xFFFF) : const0_rtx));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  rtx ret = gen_reg_rtx (vmode);
+  convert_move (ret, bit_mask, false);
+  return ret;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
-   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
+   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  If EMULATE
+   is set, then we generate vec_merge instruction for SSE/SSE2, which
+   can be emulated by and/andn/or approach.  */

 static bool
-expand_vec_perm_blend (struct expand_vec_perm_d *d)
+expand_vec_perm_blend (struct expand_vec_perm_d *d, bool emulate = false)
 {
   enum machine_mode vmode = d->vmode;
   unsigned i, mask, nelt = d->nelt;
@@ -39646,7 +39696,14 @@  expand_vec_perm_blend (struct expand_vec_perm_d *d)

   if (d->one_operand_p)
     return false;
-  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+
+  /* If the flag EMULATE is set, then we will use and/andn/or to emulate a
+     blend instruction.  Note that we don't emulate blend instruction for
+     V16QImode, in which case vec_merge instruction won't be generated, and
+     this is also not useful in practice.  */
+  if (emulate && !TARGET_SSE4_1 && vmode != V16QImode)
+    ;
+  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
@@ -39667,9 +39724,6 @@  expand_vec_perm_blend (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;

-  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
-     decision should be extracted elsewhere, so that we only try that
-     sequence once all budget==3 options have been tried.  */
   target = d->target;
   op0 = d->op0;
   op1 = d->op1;
@@ -41629,6 +41683,10 @@  ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_1 (d))
     return true;

+  /* Try emulating the blend instruction using other instructions.  */
+  if (expand_vec_perm_blend (d, true))
+    return true;
+
   /* Try sequences of two instructions.  */

   if (expand_vec_perm_pshuflw_pshufhw (d))
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index e5dd90c..e77ec3e 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -737,6 +737,11 @@ 
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 2, 3)")))

+;; Match 2 to 255.
+(define_predicate "const_2_to_255_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 2, 255)")))
+
 ;; Match 4 to 5.
 (define_predicate "const_4_to_5_operand"
   (and (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7bb2d77..551a05a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1509,13 +1509,15 @@ 
    (set_attr "mode" "<MODE>")])

 (define_insn "avx_addsubv4df3"
-  [(set (match_operand:V4DF 0 "register_operand" "=x")
- (vec_merge:V4DF
-  (plus:V4DF
-    (match_operand:V4DF 1 "register_operand" "x")
-    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
-  (minus:V4DF (match_dup 1) (match_dup 2))
-  (const_int 10)))]
+[(set (match_operand:V4DF 0 "register_operand" "=x,x")
+ (vec_select:V4DF
+   (vec_concat:V8DF
+     (minus:V4DF
+       (match_operand:V4DF 1 "register_operand" "0,x")
+       (match_operand:V4DF 2 "nonimmediate_operand" "xm,xm"))
+     (plus:V4DF (match_dup 1) (match_dup 2)))
+   (parallel [(const_int 0) (const_int 5)
+      (const_int 2) (const_int 7)])))]
   "TARGET_AVX"
   "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseadd")
@@ -1524,12 +1526,13 @@ 

 (define_insn "sse3_addsubv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x,x")
- (vec_merge:V2DF
-  (plus:V2DF
-    (match_operand:V2DF 1 "register_operand" "0,x")
-    (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
-  (minus:V2DF (match_dup 1) (match_dup 2))
-  (const_int 2)))]
+ (vec_select:V2DF
+   (vec_concat:V4DF
+     (minus:V2DF
+       (match_operand:V2DF 1 "register_operand" "0,x")
+       (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
+     (plus:V2DF (match_dup 1) (match_dup 2)))
+   (parallel [(const_int 0) (const_int 3)])))]
   "TARGET_SSE3"
   "@
    addsubpd\t{%2, %0|%0, %2}
@@ -9716,6 +9719,24 @@ 
   [(set (attr "length")
      (symbol_ref ("(Pmode != word_mode) + 3")))])

+(define_insn_and_split "*vec_merge<mode>"
+  [(set (match_operand:V_128 0 "register_operand")
+ (vec_merge:V_128
+  (match_operand:V_128 2 "nonimmediate_operand")
+  (match_operand:V_128 1 "register_operand")
+  (match_operand:SI 3 "const_2_to_255_operand")))]
+  "TARGET_SSE && !TARGET_SSE4_1 && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+ (and:V_128 (match_dup 3) (match_dup 2)))
+   (set (match_dup 3)
+ (and:V_128
+  (not:V_128 (match_dup 3)) (match_dup 1)))
+   (set (match_dup 0)
+ (ior:V_128 (match_dup 0) (match_dup 3)))]
+  "operands[3] = ix86_generate_mask_from_vec_merge (<MODE>mode, operands[3]);")
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; SSSE3 instructions
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 09c7f20..0c8818a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@ 
+2013-11-20  Cong Hou  <congh@google.com>
+
+ * gcc.target/i386/sse-vect-addsubps.c: New test.
+ * gcc.target/i386/sse2-vect-addsubpd.c: New test.
+ * gcc.target/i386/sse3-vect-addsubpd.c: New test.
+ * gcc.target/i386/sse3-vect-addsubps.c: New test.
+
 2013-11-12  Balaji V. Iyer  <balaji.v.iyer@intel.com>

  * gcc.dg/cilk-plus/cilk-plus.exp: Added a check for LTO before running
diff --git a/gcc/testsuite/gcc.target/i386/sse-vect-addsubps.c
b/gcc/testsuite/gcc.target/i386/sse-vect-addsubps.c
new file mode 100644
index 0000000..914020b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-vect-addsubps.c
@@ -0,0 +1,36 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fdump-tree-slp-details" } */
+
+float a[4], b[4], c[4];
+
+void subadd ()
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-vect-addsubpd.c
b/gcc/testsuite/gcc.target/i386/sse2-vect-addsubpd.c
new file mode 100644
index 0000000..2f78ec8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-vect-addsubpd.c
@@ -0,0 +1,36 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fdump-tree-slp-details" } */
+
+double a[4], b[4], c[4];
+
+void subadd ()
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse3-vect-addsubpd.c
b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubpd.c
new file mode 100644
index 0000000..99d55dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubpd.c
@@ -0,0 +1,36 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fdump-tree-slp-details" } */
+
+double a[4], b[4], c[4];
+
+void subadd ()
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse3-vect-addsubps.c
b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubps.c
new file mode 100644
index 0000000..ea440ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse3-vect-addsubps.c
@@ -0,0 +1,36 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-additional-options "-O2 -ftree-vectorize -fdump-tree-slp-details" } */
+
+float a[4], b[4], c[4];
+
+void subadd ()
+{
+  c[0] = a[0] - b[0];
+  c[1] = a[1] + b[1];
+  c[2] = a[2] - b[2];
+  c[3] = a[3] + b[3];
+}
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  for (i = 0; i < 4; ++i)
+    {
+      a[i] = (i + 1.2) / 3.4;
+      b[i] = (i + 5.6) / 7.8;
+    }
+
+  subadd ();
+
+  if (c[0] != a[0] - b[0]
+      || c[1] != a[1] + b[1]
+      || c[2] != a[2] - b[2]
+      || c[3] != a[3] + b[3])
+    abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp" } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 825f73a..de7790d 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -398,7 +398,7 @@  vect_build_slp_tree_1 (loop_vec_info loop_vinfo,
bb_vec_info bb_vinfo,
        unsigned int vectorization_factor, bool *matches)
 {
   unsigned int i;
-  gimple stmt = stmts[0];
+  gimple first_stmt = stmts[0], stmt = stmts[0];
   enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
   enum tree_code first_cond_code = ERROR_MARK;