diff mbox series

[2/2,RFC,middle-end] : Add complex number arithmetic patterns to SLP pattern matcher.

Message ID 20191001113935.GA12235@arm.com
State New
Headers show
Series [1/2,RFC,middle-end] : Add SLP pattern matcher. | expand

Commit Message

Tamar Christina Oct. 1, 2019, 11:39 a.m. UTC
Hi All,

This patch adds pattern detections for the following operations:

 1) Addition with rotation of the second argument around the Argand plane.
    Supported rotations are 90 and 180.

    c = a + (b * I) and c = a + (b * I * I)

  2) Complex multiplication and Conjucate Complex multiplication of the second
     parameter.

    c = a * b and c = a * conj (b)

  3) Complex FMLA, Conjucate FMLA of the second parameter and FMLS.

    c += a * b, c += a * conj (b) and c -= a * b

  For the conjucate cases it supports under fast-math that the operands that is
  being conjucated be flipped by flipping the arguments to the optab.  This
  allows it to support c = conj (a) * b and c += conj (a) * b.

  where a, b and c are complex numbers.

The patterns work by looking at the sequence produced after GCC lowers complex
numbers.  As such they would match any normal operation that does the same
computations.

For this to work it has to recognize two statements in parallel as it needs to
match against operations towards both the real and imaginary numbers.  The
instructions generated by this change is also only available in their vector
forms.

The instructions also require the loads to be contiguous and so when a match is
made, and the code decides it is able to do the replacement it cancels out all
permutes.

The nodes of the SLP tree is still re-organized such that subsequent matches
match against the correct operation.  For instance matching the following SLP
tree:

     +-----------------------------------+
     | stmt 0 REALPART_EXPR <*_3> = _4;  |
     | stmt 1 IMAGPART_EXPR <*_3> = _36; |
     +-----------------------------------+
       |
       |
       v
     +-----------------------------------+
     |      stmt 0 _4 = _16 - _34;       |
 <-- |      stmt 1 _36 = _15 + _33;      |
     +-----------------------------------+
       |
       |
       v
     +-----------------------------------+
     |      stmt 0 _34 = _31 + _32;      |
 <-- |      stmt 1 _33 = _29 - _30;      |
     +-----------------------------------+
       |
       |
       v
     +-----------------------------------+
     |      stmt 0 _31 = _24 * _26;      |
     |      stmt 1 _29 = _23 * _26;      | -->
     +-----------------------------------+
       |
       |
       v
     +-----------------------------------+
     | stmt 0 _24 = IMAGPART_EXPR <*_7>; |
     | stmt 1 _23 = REALPART_EXPR <*_7>; |
     +-----------------------------------+

Which corresponds to the following C code

__attribute__((noinline,noipa))
void fma90_new (double _Complex a[restrict 32], double _Complex b[restrict 32],
		double _Complex c[restrict 32])
{
  for (int i=0; i < 32; i++)
      c[i] += a[i] * b[i] * 1.0fi;
}

shows that the patterns in node (_4, _36) and (_34, _33) are the same. e.g. the
generated sequence is

        mov     x3, 0
        .p2align 3,,7
.L139:
        ldr     q0, [x0, x3]
        ldr     q3, [x1, x3]
        ldr     q1, [x2, x3]
        fmul    v2.2d, v3.2d, v0.d[0]
        fmul    v0.2d, v3.2d, v0.d[1]
        fcadd   v0.2d, v2.2d, v0.2d, #90
        fcadd   v0.2d, v1.2d, v0.2d, #90
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, 512
        bne     .L139
        ret

Which you can only get by re-ordering the operations after matching the first
FCADD #90 and noticing that after that the next instruction is also an FCADD 90.

When a replacement is done a new internal function is generated which the
back-end has to expand to the proper instruction sequences.

Concretely, this generates

        ldr	q1, [x1, x3]
	ldr	q2, [x0, x3]
	ldr	q0, [x2, x3]
	fcmla	v0.2d, v1.2d, v2.2d, #180
	fcmla	v0.2d, v1.2d, v2.2d, #90
	str	q0, [x2, x3]
	add	x3, x3, 16
	cmp	x3, 3200
	bne	.L2
	ret

now instead of

	add	x3, x2, 31
	sub	x4, x3, x1
	sub	x3, x3, x0
	cmp	x4, 62
	mov	x4, 62
	ccmp	x3, x4, 0, hi
	bls	.L5
	mov	x3, x0
	mov	x0, x1
	add	x1, x2, 3200
	.p2align 3,,7
.L3:
	ld2	{v16.2d - v17.2d}, [x2]
	ld2	{v2.2d - v3.2d}, [x3], 32
	ld2	{v0.2d - v1.2d}, [x0], 32
	mov	v7.16b, v17.16b
	fmul	v6.2d, v0.2d, v3.2d
	fmla	v7.2d, v1.2d, v3.2d
	fmla	v6.2d, v1.2d, v2.2d
	fmls	v7.2d, v2.2d, v0.2d
	fadd	v4.2d, v6.2d, v16.2d
	mov	v5.16b, v7.16b
	st2	{v4.2d - v5.2d}, [x2], 32
	cmp	x2, x1
	bne	.L3
	ret
.L5:
	add	x4, x2, 8
	add	x6, x0, 8
	add	x5, x1, 8
	mov	x3, 0
	.p2align 3,,7
.L2:
	ldr	d1, [x6, x3]
	ldr	d4, [x1, x3]
	ldr	d5, [x5, x3]
	ldr	d3, [x0, x3]
	fmul	d2, d4, d1
	ldr	d0, [x4, x3]
	fmadd	d0, d5, d1, d0
	ldr	d1, [x2, x3]
	fmadd	d2, d5, d3, d2
	fmsub	d0, d4, d3, d0
	fadd	d1, d1, d2
	str	d1, [x2, x3]
	str	d0, [x4, x3]
	add	x3, x3, 16
	cmp	x3, 3200
	bne	.L2
	ret

I've omitted the optabs definition from the patch to keep the RFC small.
The optabs and IFN definitions are quite straight forward.

Bootstrapped on aarch64-none-linux-gnu with patterns on and no issues.
no regression testing done yet but spec 2017 builds cleanly.

Thanks,
Tamar

gcc/ChangeLog:

2019-10-01  Tamar Christina  <tamar.christina@arm.com>

	* tree-vect-slp.c (vect_split_slp_tree): New.
	(vect_merge_slp_tree): New.
	(vect_create_merged_slp_tree): New.
	(vect_transform_complex_slp_tree_1): New.
	(vect_transform_complex_slp_tree): New.
	(vect_match_expression_p): New.
	(vect_detect_pair_op): New.
	(vect_is_complex_trans_valid): New.
	(vect_match_call_complex_add): New.
	(vect_match_call_complex_mla_1): New.
	(vect_match_call_complex_mla): New.
	(vect_match_call_complex_mul): New.
	(vect_match_slp_patterns): Add patterns.

--

Comments

Toon Moene Oct. 1, 2019, 6:29 p.m. UTC | #1
On 10/1/19 1:39 PM, Tamar Christina wrote:

> The patterns work by looking at the sequence produced after GCC lowers complex
> numbers.  As such they would match any normal operation that does the same
> computations.

Thanks - I didn't understand Ramana's comments during the GNU Tools 
Cauldron about this feature, but now I do.

Can't wait to put my (upcoming) ThunderX hardware to work on this (plus 
that I have to teach *a lot* of 30-year+ Fortran programmers that you do 
not have to lower COMPLEX arithmetic yourself, because the compiler will 
do this optimally for you ...).

Kind regards,
diff mbox series

Patch

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 7b7453981975f1d756965a96499f651ff2fcea7a..69f377c70b2e7543b4711bcf9b089aaa396ee162 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2044,6 +2044,323 @@  vect_create_slp_patt_stmt (slp_tree node, match_workset *work,
   return call_stmt;
 }
 
+/* Split the given SLP tree rooted in NODE into a unit tree containing the
+   statement as position IDX and all it's children at the same position.  */
+
+static slp_tree
+vect_split_slp_tree (slp_tree node, int idx)
+{
+  auto_vec<stmt_vec_info> scalar_stmts;
+  auto_vec<slp_tree> nodes;
+  unsigned i;
+  slp_tree child;
+
+  nodes.create (SLP_TREE_CHILDREN (node).length ());
+  scalar_stmts.safe_push (SLP_TREE_SCALAR_STMTS (node)[idx]);
+  slp_tree new_node = vect_create_new_slp_node (scalar_stmts.copy());
+
+  /* Copy over some meta data from the previous node. */
+  SLP_TREE_DEF_TYPE (new_node) = SLP_TREE_DEF_TYPE (node);
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    nodes.quick_push (vect_split_slp_tree (child, idx));
+
+  SLP_TREE_CHILDREN (new_node).safe_splice (nodes);
+
+  return new_node;
+}
+
+/* Merge a unit tree SRC created by vect_split_slp_tree into DEST if they have
+   the same shape.  If DEST is empty then SRC is returned.  Otherwise
+   recursively merge all the children of SRC and DEST together such that those
+   from SRC are added last to each node.
+
+   TODO: This function doesn't handle when the SLP tree is a graph very well.
+   It won't create something invalid, but it won't share nodes as was the case
+   before.  This can be supported by adding a cache to look up nodes in.  */
+
+static slp_tree
+vect_merge_slp_tree (slp_tree dest, slp_tree src)
+{
+  slp_tree child;
+  unsigned i;
+  if (dest == NULL)
+    return src;
+
+  if (SLP_TREE_CHILDREN (dest).length () != SLP_TREE_CHILDREN (src).length ())
+    return NULL;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (dest), i, child)
+    SLP_TREE_CHILDREN (dest)[i]
+      = vect_merge_slp_tree (child, SLP_TREE_CHILDREN (src)[i]);
+
+  SLP_TREE_SCALAR_STMTS (dest).safe_push (SLP_TREE_SCALAR_STMTS (src).last ());
+  return dest;
+}
+
+/* Create a tree containing as the children the definition of all statements in
+   DEFS so the new tree can be used as the root for new statements using the
+   statements in DEFS.  This node will be used by the pattern matcher as the new
+   definition site for arguments to the created patterns.
+
+   IDX indicated the operand to the pattern in WORKSET that is being created.
+   e.g. if IDX is 0 then the resulting node will contain all the definition for
+   first operand of every statement in DEFS.
+
+   This function is used as a sliding window to create all the children one
+   operand at a time.  The definition site of the statement is looked up in
+   ARG_MAP which is created by vect_transform_complex_slp_tree_1.  */
+
+static slp_tree
+vect_create_merged_slp_tree (unsigned idx, const vec<stmt_vec_info> defs,
+			     vec<match_workset> workset,
+			     ssa_name_def_to_slp_tree_map_t *arg_map)
+{
+  match_workset work;
+  unsigned int i;
+  int x;
+  slp_tree tmp = NULL;
+
+  for (i = 0; i < workset.length (); i++)
+    {
+      work = workset[i];
+      int stride = work.vects_len / work.patt_info->arity;
+      for (x = 0; x < work.patt_info->arity; x++)
+	{
+	  int pos = idx + (x * stride);
+	  stmt_vec_info op = defs[work.vects_offset + pos];
+	  gcc_assert (op);
+	  tmp = vect_merge_slp_tree (tmp, *arg_map->get (op));
+          if (tmp == NULL)
+	    return NULL;
+	}
+   }
+
+  return tmp;
+}
+
+/* Helper function for vect_transform_complex_slp_tree.  Find for each statement
+   in DEFS rooted in the SLP tree NODE the actually SLP node containing the
+   definition of the statement.  This pair of statement and node is stored in
+   ARG_MAP and any node that are visited in the walk from NODE to DEST_NODE are
+   added to FREE_SET to be freed later as they are no longer used after the
+   pattern statement is applied.  */
+
+static void
+vect_transform_complex_slp_tree_1 (slp_tree node, vec<stmt_vec_info> defs,
+				   ssa_name_def_to_slp_tree_map_t *arg_map,
+				   vec<slp_tree> *free_set)
+{
+  unsigned i;
+  vec<stmt_vec_info> scalar_stmts = SLP_TREE_SCALAR_STMTS (node);
+  slp_tree child;
+  stmt_vec_info scalar_stmt;
+
+  gcc_assert (scalar_stmts.length () > 0);
+  bool found_p = false;
+
+  FOR_EACH_VEC_ELT (scalar_stmts, i, scalar_stmt)
+    {
+      if (defs.contains (scalar_stmt))
+	{
+	  arg_map->put (scalar_stmt, vect_split_slp_tree (node, i));
+	  found_p = true;
+	}
+    }
+
+  if (!found_p)
+    {
+      free_set->safe_push (node);
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+	  vect_transform_complex_slp_tree_1 (child, defs, arg_map, free_set);
+    }
+}
+
+/* The post transform and validation function for the complex number patterns.
+   This will re-arrange the tree and re-organize the nodes such that they can be
+   used by the complex number instructions that are to be created.  It does this
+   by doing the following steps:
+
+   1) It looks up the definition nodes of each statement in DEFS which are the
+      new arguments to be used in the new patterns that will be created.  It
+      does this by calling vect_transform_complex_slp_tree_1.
+
+   2) Unit trees are created for each statement in DEFS and they are stored in a
+      hash_map for lookup.  We also create a list of nodes that are not used
+      anymore after the patterns are created.  These are defined as the nodes
+      between the current NODE and the definition sites that were looked up.
+
+   3) The unit trees are merged and the new root is created with these trees as
+      the children.  If the merging was successful it means that all operations
+      in the tree are valid and the shapes were valid.
+
+   4) The children of NODE are replaced with the new set of nodes we created.
+
+   5) The list of nodes to be freed are released if their refcounts are now 0.
+
+   6) The cancel_permutes flag is set and success is returned.
+
+   This sequence of operations does an implicit re-ordering and DSE of nodes.
+   e.g. it will undo as much of the permutes as it possibly can.  This is
+   required such that pattern matchers running on the newly created statements
+   match the correct operations.  */
+
+static bool
+vect_transform_complex_slp_tree (slp_tree node, const vec<stmt_vec_info> defs,
+				 int arity, vec<match_workset> workset,
+				 bool *cancel_permutes)
+{
+  slp_tree child;
+  int i;
+
+  auto_vec<slp_tree> nodes;
+  nodes.create (arity);
+  auto_vec<slp_tree> free_set;
+
+  ssa_name_def_to_slp_tree_map_t *arg_map
+    = new ssa_name_def_to_slp_tree_map_t ();
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    vect_transform_complex_slp_tree_1 (child, defs, arg_map, &free_set);
+
+  for (int x = 0; x < arity; x++)
+    {
+      slp_tree tmp = vect_create_merged_slp_tree (x, defs, workset, arg_map);
+      if (tmp == NULL)
+	{
+	  delete arg_map;
+	  return false;
+	}
+      nodes.safe_push (tmp);
+    }
+
+  /* Now that we've actually done the replacements, we need to free the nodes
+     that are no longer relevant.  We must do this non-recursively as we're only
+     interesting in freeing the node alone and not it's children.  It's always
+     a non-final call and any statements we do encounter that have refcount 0
+     should be marked as unused.  */
+  FOR_EACH_VEC_ELT (free_set, i, child)
+    vect_free_slp_tree (child, false, false, true);
+
+  SLP_TREE_CHILDREN (node).truncate (0);
+  SLP_TREE_CHILDREN (node).safe_splice (nodes);
+
+  /* TWO_OPERATORS can only handle simple opts, and so these call patterns
+     can't use it.  */
+  SLP_TREE_TWO_OPERATORS (node) = false;
+
+  /* Cancel the permute.  */
+  *cancel_permutes = true;
+
+  delete arg_map;
+  return true;
+}
+
+/* The COMPLEX_OPERATION enum denotes the possible pair of operations that can
+   be matched when looking for expressions that we are interested matching for
+   complex numbers addition and mla.  */
+
+typedef enum _complex_operation {
+  PLUS_PLUS, MINUS_PLUS, PLUS_MINUS, MULT_MULT, NEG_NEG, CMPLX_NONE
+} complex_operation_t;
+
+/* Checks to see of the expression EXPR is a gimple assign with code CODE and if
+   this is the case the two operands of EXPR is returned in OP1 and OP2.
+
+   If the matching and extraction is successful TRUE is returned otherwise FALSE
+   in which case the value of OP1 and OP2 will not have been touched.  */
+
+static bool
+vect_match_expression_p (slp_tree node, tree_code code, int base, int idx,
+			 stmt_vec_info *op1, stmt_vec_info *op2)
+{
+  vec<stmt_vec_info> scalar_stmts = SLP_TREE_SCALAR_STMTS (node);
+  int n = base + idx;
+  if (scalar_stmts.length () < (unsigned)n) // can use group_size
+    return false;
+
+  gimple* expr = STMT_VINFO_STMT (scalar_stmts[n]);
+  if (!is_gimple_assign (expr)
+      || gimple_expr_code (expr) != code)
+    return false;
+
+  vec<slp_tree> children = SLP_TREE_CHILDREN (node);
+  if (children.length () != (op2 ? 2 : 1))
+    return false;
+
+  if (op1)
+    *op1 = SLP_TREE_SCALAR_STMTS (children[0])[n];
+
+  if (op2)
+    *op2 = SLP_TREE_SCALAR_STMTS (children[1])[n];
+
+  return true;
+}
+
+/* This function will match two gimple expressions STMT_0 and STMT_1 in parallel
+   and returns the pair operation that represents the two expressions in the
+   two statements.
+
+   If match is successful then the corresponding complex_operation is returned
+   and the arguments to the two matched operations are returned in OPS.
+
+   If unsuccessful then CMPLX_NONE is returned and OPS is untouched.
+
+   e.g. the following gimple statements
+
+   stmt 0 _39 = _37 + _12;
+   stmt 1 _6 = _38 - _36;
+
+   will return PLUS_MINUS along with OPS containing {_37, _12, _38, _36}.  */
+
+static complex_operation_t
+vect_detect_pair_op (int base, slp_tree node1, int offset1,
+		     slp_tree node2, int offset2,
+		     vec<stmt_vec_info> *ops)
+{
+  stmt_vec_info op1 = NULL, op2 = NULL, op3 = NULL, op4	 = NULL;
+  complex_operation_t result = CMPLX_NONE;
+  if (vect_match_expression_p (node1, MINUS_EXPR, base, offset1, &op1, &op2)
+      && vect_match_expression_p (node2, PLUS_EXPR, base, offset2, &op3, &op4))
+    result = MINUS_PLUS;
+  else if (vect_match_expression_p (node1, PLUS_EXPR, base, offset1, &op1, &op2)
+	   && vect_match_expression_p (node2, MINUS_EXPR, base, offset2, &op3,
+				       &op4))
+    result = PLUS_MINUS;
+  else if (vect_match_expression_p (node1, PLUS_EXPR, base, offset1, &op1, &op2)
+	   && vect_match_expression_p (node2, PLUS_EXPR, base, offset2, &op3,
+				       &op4))
+    result = PLUS_PLUS;
+  else if (vect_match_expression_p (node1, MULT_EXPR, base, offset1, &op1, &op2)
+	   && vect_match_expression_p (node2, MULT_EXPR, base, offset2, &op3,
+				       &op4))
+    result = MULT_MULT;
+  else if (vect_match_expression_p (node1, NEGATE_EXPR, base, offset1, &op1,
+				    NULL)
+	   && vect_match_expression_p (node2, NEGATE_EXPR, base, offset2, &op3,
+				       NULL))
+    result = NEG_NEG;
+
+  if (result != CMPLX_NONE && ops != NULL)
+    {
+      ops->create (4);
+      ops->quick_push (op1);
+      ops->quick_push (op2);
+      ops->quick_push (op3);
+      ops->quick_push (op4);
+    }
+  return result;
+}
+
+/* Overload of vect_detect_pair_op where the statements are assumed to be one
+   after the other.  */
+
+static complex_operation_t
+vect_detect_pair_op (int base, slp_tree node, vec<stmt_vec_info> *ops)
+{
+  return vect_detect_pair_op (base, node, 0, node, 1, ops);
+}
+
 /* Helper function of vect_match_slp_patterns.
 
    Attempts to match the given pattern PATT_INFO against the slp tree rooted in
@@ -2199,6 +2516,377 @@  vect_match_slp_patterns_2 (slp_tree node, vec_info *vinfo,
   return found_p | found_rec_p;
 }
 
+/* An extracted version of vect_build_slp_tree_1 which checks to see that the
+   operands of the new patterns are not the same.  If they are the same than
+   there will never be a benefit to doing this transformation as there will not
+   be a permute.  This version is much simplified from vect_build_slp_tree_1 as
+   it assumes all those checks were already performed.
+
+   It also checks to see if the operations for each grouping of statements are
+   compatible.  i.e. make sure we still have compatible statements for
+   vectorization.  */
+
+/* TODO: See about sharing this code with vect_build_slp_tree_1, seems
+   non-trivial to extract logic from there.  */
+
+static bool
+vect_is_complex_trans_valid (vec<stmt_vec_info> *vects, int n)
+{
+  vec<stmt_vec_info> args = *vects;
+  int num_args = args.length () / n;
+  for (int x = 0; x < num_args; x++)
+    {
+      if (args[x] == args [x + n])
+         return false;
+
+      gimple *stmt1 = STMT_VINFO_STMT (args[x]);
+      gimple *stmt2 = STMT_VINFO_STMT (args[x + n]);
+      enum gimple_code code1 = gimple_code (stmt1);
+      enum gimple_code code2 = gimple_code (stmt2);
+
+      if (code1 != code2)
+	 return false;
+
+      switch (code1)
+       {
+	case GIMPLE_ASSIGN:
+	  {
+	    tree rhs1 = gimple_assign_rhs1 (stmt1);
+	    tree rhs2 = gimple_assign_rhs1 (stmt2);
+	    enum tree_code tcode1 = gimple_assign_rhs_code (stmt1);
+	    enum tree_code tcode2 = gimple_assign_rhs_code (stmt2);
+
+	    if ((tcode1 == IMAGPART_EXPR || tcode1 == REALPART_EXPR)
+		&& (tcode2 == IMAGPART_EXPR || tcode2 == REALPART_EXPR))
+	       continue;
+
+	    if ((tcode1 == PLUS_EXPR || tcode1 == MINUS_EXPR)
+		&& (tcode2 == PLUS_EXPR || tcode2 == MINUS_EXPR))
+	       continue;
+
+	    if (tcode1 == tcode2)
+	       continue;
+
+	    if (!operand_equal_p (rhs1, rhs2, 0))
+	       return false;
+	    break;
+	  }
+	case GIMPLE_CALL:
+	  {
+	    if (!gimple_assign_load_p (stmt1)
+		&& !compatible_calls_p (as_a <gcall *> (stmt1),
+					as_a <gcall *> (stmt2)))
+	       return false;
+	    break;
+	  }
+	default:
+	  break;
+       }
+    }
+
+  return true;
+}
+
+/* Pattern matcher for trying to match complex addition pattern in SLP tree
+   using the N statements statements found in node starting at position IDX.
+   If the operation matches then IFN is set to the operation it matched and the
+   arguments to the two replacement statements are put in VECTS.
+
+   If no match is found then IFN is set to IFN_LAST and V1 and V2 are unchanged.
+
+   This function matches the patterns shaped as:
+
+      c[i] = a[i] - b[i+1];
+      c[i+1] = a[i+1] + b[i];
+
+   If a match occurred then TRUE is returned, else FALSE.  */
+
+static bool
+vect_match_call_complex_add (slp_tree node,
+			     stmt_vec_info *stmts ATTRIBUTE_UNUSED, int idx,
+			     int n, internal_fn *ifn, vec<stmt_vec_info> *vects,
+			     vec_info *info ATTRIBUTE_UNUSED)
+{
+  *ifn = IFN_LAST;
+  int base = idx - (n - 1);
+
+  complex_operation_t op = vect_detect_pair_op (base, node, vects);
+
+  /* Find the two components.  Rotation in the complex plane will modify
+     the operations:
+
+     * Rotation  0: + +
+     * Rotation 90: - +
+     * Rotation 180: - -
+     * Rotation 270: + -
+
+     Rotation 0 and 180 can be handled by normal SIMD code, so we don't need
+     to care about them here.  */
+  if (op == MINUS_PLUS)
+    *ifn = IFN_COMPLEX_ADD_ROT90;
+  else if (op == PLUS_MINUS)
+    *ifn = IFN_COMPLEX_ADD_ROT270;
+
+  if (*ifn == IFN_LAST)
+    return false;
+
+  vec<stmt_vec_info> args = *vects;
+
+  /* Correct the arguments after matching.  */
+  std::swap (args[1], args[3]);
+
+   /* If the two operands are the same, we don't have a permute. In such a case
+      there is no advantage in doing the replacement.  */
+  return vect_is_complex_trans_valid (vects, 2);
+}
+
+/* Helper function of vect_match_call_complex_mla that looks up the definition
+   of LHS_0 and LHS_1 by finding the statements starting in position BASE + IDX
+   in child ROOT of NODE and tries to match the definition against pair ops.
+
+   If the match is successful then ARGS will contain the operands matched and
+   the complex_operation_t type is returned.  If match is not successful then
+   CMPLX_NONE is returned and ARGS is left unmodified.  */
+
+static complex_operation_t
+vect_match_call_complex_mla_1 (slp_tree node, slp_tree *res, int root, int base,
+			       int idx, vec<stmt_vec_info> *args)
+{
+  gcc_assert (base >= 0 && idx >= 0 && node != NULL);
+
+  if ((unsigned)root >= SLP_TREE_CHILDREN (node).length ())
+    return CMPLX_NONE;
+
+  slp_tree data = SLP_TREE_CHILDREN (node)[root];
+
+  int lhs_0 = base + idx;
+  int lhs_1 = base + idx + 1;
+
+  gimple *stmt_0 = STMT_VINFO_STMT (SLP_TREE_SCALAR_STMTS (data)[lhs_0]);
+  gimple *stmt_1 = STMT_VINFO_STMT (SLP_TREE_SCALAR_STMTS (data)[lhs_1]);
+
+  if (gimple_expr_type (stmt_0) != gimple_expr_type (stmt_1))
+    return CMPLX_NONE;
+
+  if (res)
+    *res = data;
+
+  return vect_detect_pair_op (base, data, args);
+}
+
+/* Pattern matcher for trying to match complex multiply and accumulate pattern
+   in SLP tree using N statements STMT_0 and STMT_0 as the root
+   statements by finding the statements starting in position IDX in NODE.  If
+   the operation matches then IFN is set to the operation it matched and the
+   arguments to the two replacement statements are put in VECTS.
+
+   If no match is found then IFN is set to IFN_LAST and VECTS is unchanged.
+
+   This function matches the patterns shaped as:
+
+      double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
+      double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
+
+      c[i] = c[i] - ax;
+      c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  */
+
+//TODO: Figure out why auto_vec<.., 4> here sometimes corrupts list. or add
+//      explicit releases.
+static bool
+vect_match_call_complex_mla (slp_tree node,
+			     stmt_vec_info *stmts ATTRIBUTE_UNUSED, int idx,
+			     int n, internal_fn *ifn, vec<stmt_vec_info> *vects,
+			     vec_info *vinfo ATTRIBUTE_UNUSED)
+{
+  *ifn = IFN_LAST;
+  int base = idx - (n - 1);
+  /* Find the two components.  Rotation in the complex plane will modify
+     the operations:
+
+     * Rotation  0: + +
+     * Rotation 90: - +
+     * Rotation 180: - -
+     * Rotation 270: + -.  */
+  vec<stmt_vec_info> args0;
+  complex_operation_t op1 = vect_detect_pair_op (base, node, &args0);
+
+  if (op1 == CMPLX_NONE)
+    return false;
+
+  slp_tree sub1, sub2a, sub2b, sub3;
+
+  /* Now operand2+4 must lead to another expression.  */
+  vec<stmt_vec_info> args1;
+  complex_operation_t op2
+    = vect_match_call_complex_mla_1 (node, &sub1, 1, base, 0, &args1);
+
+  if (op2 != MINUS_PLUS && op2 != PLUS_MINUS)
+    return false;
+
+  /* Now operand1+3 must lead to another expression.  */
+  vec<stmt_vec_info> args2;
+  complex_operation_t op3
+    = vect_match_call_complex_mla_1 (sub1, &sub2a, 0, base, 0, &args2);
+
+  if (op3 != MULT_MULT)
+    return false;
+
+  /* Now operand2+4 must lead to another expression.  */
+  vec<stmt_vec_info> args3;
+  complex_operation_t op4
+    = vect_match_call_complex_mla_1 (sub1, &sub2b, 1, base, 0, &args3);
+
+  if (op4 != MULT_MULT)
+    return false;
+
+  /* Now operand2+4 may lead to another expression.  */
+  vec<stmt_vec_info> args4;
+  complex_operation_t op5
+    = vect_match_call_complex_mla_1 (sub2b, &sub3, 1, base, 0, &args4);
+
+  if (op1 == PLUS_MINUS && op2 == MINUS_PLUS)
+    *ifn = IFN_COMPLEX_FMS;
+  else if (op1 == PLUS_PLUS && op2 == MINUS_PLUS)
+    {
+      if (op5 == CMPLX_NONE)
+	*ifn = IFN_COMPLEX_FMA;
+       else if (op5 == NEG_NEG)
+	  *ifn = IFN_COMPLEX_FMA_CONJ;
+    }
+
+  if (*ifn == IFN_LAST)
+    return false;
+
+
+  vects->create (6);
+
+  if (*ifn == IFN_COMPLEX_FMA_CONJ)
+    {
+      /* Check if the conjucate is on the first or second parameter.  */
+      if (args2[0] == args2[2] && args3[1] == args3[3])
+	{
+	  vects->quick_push (args0[0]);
+	  vects->quick_push (args2[2]);
+	  vects->quick_push (args3[2]);
+	  vects->quick_push (args0[2]);
+	  vects->quick_push (args4[0]);
+	  vects->quick_push (args2[3]);
+	}
+      else
+	{
+	  /* When the conjucate is on the first argument the SLP build fails and
+	     because of this we never get here.  We should be able to support it
+	     by swapping the parameters like with MUL, but since it currently
+	     can't be tested don't support it.  */
+	  return false;
+	}
+    }
+   else
+    {
+      vects->quick_push (args0[0]);
+      vects->quick_push (args2[3]);
+      vects->quick_push (args3[2]);
+      vects->quick_push (args0[2]);
+      vects->quick_push (args3[3]);
+      vects->quick_push (args2[2]);
+    }
+
+  return vect_is_complex_trans_valid (vects, 3);
+}
+
+/* Pattern matcher for trying to match complex multiply pattern in SLP tree
+   using N statements STMT_0 and STMT_0 as the root statements by finding the
+   statements starting in position IDX in NODE.  If the operation matches then
+   IFN is set to the operation it matched and the arguments to the two
+   replacement statements are put in VECTS.
+
+   If no match is found then IFN is set to IFN_LAST and VECTS is unchanged.
+
+   This function matches the patterns shaped as:
+
+      double ax = (b[i+1] * a[i]);
+      double bx = (a[i+1] * b[i]);
+
+      c[i] = c[i] - ax;
+      c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  */
+
+static bool
+vect_match_call_complex_mul (slp_tree node,
+			     stmt_vec_info *stmts ATTRIBUTE_UNUSED, int idx,
+			     int n, internal_fn *ifn, vec<stmt_vec_info> *vects,
+			     vec_info *vinfo ATTRIBUTE_UNUSED)
+{
+  *ifn = IFN_LAST;
+  int base = idx - (n - 1);
+
+  complex_operation_t op1 = vect_detect_pair_op (base, node, NULL);
+
+  if (op1 != MINUS_PLUS)
+    return false;
+
+  slp_tree sub1a, sub1b, sub2;
+  /* Now operand1+3 must lead to another expression.  */
+  vec<stmt_vec_info> args0;
+  complex_operation_t op2
+    = vect_match_call_complex_mla_1 (node, &sub1a, 0, base, 0, &args0);
+
+  if (op2 != MULT_MULT)
+    return false;
+
+  /* Now operand2+4 must lead to another expression.  */
+  vec<stmt_vec_info> args1;
+  complex_operation_t op3
+    = vect_match_call_complex_mla_1 (node, &sub1b, 1, base, 0, &args1);
+
+  if (op3 != MULT_MULT)
+    return false;
+
+  /* Now operand2+4 may lead to another expression.  */
+  vec<stmt_vec_info> args2;
+  complex_operation_t op4
+    = vect_match_call_complex_mla_1 (sub1b, &sub2, 1, base, 0, &args2);
+
+  if (op4 != CMPLX_NONE && op4 != NEG_NEG)
+    return false;
+
+  vects->create (4);
+
+  if (op4 == CMPLX_NONE)
+    {
+      *ifn = IFN_COMPLEX_MUL;
+      /* Correct the arguments after matching.  */
+      std::swap (args0[2], args1[0]);
+    }
+  else if (op4 == NEG_NEG)
+    {
+      *ifn = IFN_COMPLEX_MUL_CONJ;
+      /* Check if the conjucate is on the first or second parameter.  */
+      if (args1[1] == args1[3] && args0[1] == args0[3])
+	{
+	  vects->quick_push (args0[3]);
+	  vects->quick_push (args0[0]);
+	  vects->quick_push (args2[0]);
+	  vects->quick_push (args0[2]);
+	}
+      else
+	{
+	  /* Correct the arguments after matching.  */
+	  std::swap (args0[2], args2[0]);
+	}
+    }
+
+  if (vects->length () == 0)
+    vects->splice (args0);
+
+  return *ifn != IFN_LAST && vect_is_complex_trans_valid (vects, 3);
+}
+
+
+
 /* Applies pattern matching to the given SLP tree rooted in NODE using vec_info
    VINFO and group size GROUP_SIZE.
 
@@ -2221,7 +2909,14 @@  vect_match_slp_patterns (slp_tree node, vec_info *vinfo,
     }
 
 
-  static complex_pattern_t patterns[0] = {};
+  static complex_pattern_t patterns[3] = {
+    { vect_match_call_complex_mla, 2, "Complex FM(A|S)",
+      vect_transform_complex_slp_tree },
+    { vect_match_call_complex_mul, 2, "Complex Multiplication",
+      vect_transform_complex_slp_tree },
+    { vect_match_call_complex_add, 2, "Complex Addition",
+      vect_transform_complex_slp_tree },
+  };
 
   for (size_t x = 0; x < sizeof (patterns) / sizeof (complex_pattern_t); x++)
     found_p |= vect_match_slp_patterns_2 (node, vinfo, group_size, patterns[x],