
middle-end[RFC] slp: new implementation of complex numbers

Message ID patch-14582-tamar@arm.com
State New
Series middle-end[RFC] slp: new implementation of complex numbers

Commit Message

Tamar Christina June 21, 2021, 2:52 p.m. UTC
Hi Richi,

This patch is still very much incomplete and I do know that it is missing
things, but it's complete enough that examples are working and it allows me to
show what I'm working towards.

Note that this approach will remove a lot of code in tree-vect-slp-patterns,
but to keep the diff readable I've left that code in and just commented out
the calls or removed them where needed.

The patch rewrites the complex numbers detection by splitting the detection of
structure from dataflow analysis.  In principle the biggest difference between
this and the previous implementation is that instead of trying to detect valid
complex operations it *makes* an operation a valid complex operation.

To do this each operation gets a dual optab which matches the same structure but
has no dataflow requirement.

i.e. in this patch I added four: ADDSUB, SUBADD, MUL_ADDSUB, MUL_SUBADD.

There is then a mapping between these and their variants with the dataflow requirement:

* ADDSUB -> COMPLEX_ADD_ROT270
* SUBADD -> COMPLEX_ADD_ROT90
* MUL_ADDSUB -> COMPLEX_MUL_CONJ
* MUL_SUBADD -> COMPLEX_MUL

with the intention that when we detect the structure of an operation we query
the backend for both optabs.
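
In the patch this mapping is just a switch over the new internal functions (it
becomes vect_get_transformed_pattern in tree-vect-slp.c); a sketch:

/* Map a dataflow-free IFN to its dataflow-carrying complex equivalent
   (mirrors the mapping listed above).  */
static internal_fn
vect_get_transformed_pattern (internal_fn ifn, slp_tree /* node */)
{
  switch (ifn)
    {
    case IFN_ADDSUB:     return IFN_COMPLEX_ADD_ROT270;
    case IFN_SUBADD:     return IFN_COMPLEX_ADD_ROT90;
    case IFN_MUL_ADDSUB: return IFN_COMPLEX_MUL_CONJ;
    case IFN_MUL_SUBADD: return IFN_COMPLEX_MUL;
    default:             gcc_unreachable ();
    }
}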

Querying both optabs should result in one of four states:

 * not supported: Move on.
 * Supports ADDSUB only: Rewrite using ADDSUB, set type to 'cannot_transform'
 * Supports COMPLEX_ADD_ROT270 only: Rewrite using ADDSUB, set type to 'must_transform'
 * Supports both: Rewrite using ADDSUB, set type to 'can_transform'

The idea behind `can_transform` is to check the cost of the inverse permute
needed to use the complex operation; if that is very expensive then we stick to
ADDSUB.  This requires the target to be able to cost the operations reasonably
correctly.
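
The patch doesn't encode these states as an enum yet; purely as an
illustration (the enum and function below are made up, and I'm glossing over
the fact that vect_pattern_validate_optab and vect_get_transformed_pattern
live in different files), the classification would look something like:

enum complex_transform_kind
{
  not_supported, cannot_transform, must_transform, can_transform
};

static complex_transform_kind
vect_classify_complex_support (internal_fn ifn, slp_tree node)
{
  bool base_p = vect_pattern_validate_optab (ifn, node);
  bool cplx_p
    = vect_pattern_validate_optab (vect_get_transformed_pattern (ifn, node),
                                   node);
  if (!base_p && !cplx_p)
    return not_supported;       /* Move on.  */
  if (base_p && !cplx_p)
    return cannot_transform;    /* Keep the ADDSUB form.  */
  if (!base_p && cplx_p)
    return must_transform;      /* Must become the COMPLEX_* form.  */
  return can_transform;         /* Decide later based on permute cost.  */
}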

So for ADD this looks like

 === vect_match_slp_patterns ===
 Analyzing SLP tree 0x494e970 for patterns
 Found ADDSUB pattern in SLP tree
 Target does not support ADDSUB for vector type vector(4) float
 Found COMPLEX_ADD_ROT270 pattern in SLP tree
 Target supports COMPLEX_ADD_ROT270 vectorization with mode vector(4) float
Pattern matched SLP tree
node 0x494e970 (max_nunits=4, refcnt=1)
op template: REALPART_EXPR <*_10> = _23;
  stmt 0 REALPART_EXPR <*_10> = _23;
  stmt 1 IMAGPART_EXPR <*_10> = _22;
  children 0x494ea00
node 0x494ea00 (max_nunits=4, refcnt=1)
op template: slp_patt_39 = .ADDSUB (_23, _23);
  stmt 0 _23 = _6 + _13;
  stmt 1 _22 = _12 - _8;
  children 0x494eb20 0x494ebb0
node 0x494eb20 (max_nunits=4, refcnt=1)
op template: _13 = REALPART_EXPR <*_3>;
  stmt 0 _13 = REALPART_EXPR <*_3>;
  stmt 1 _12 = IMAGPART_EXPR <*_3>;
node 0x494ebb0 (max_nunits=4, refcnt=1)
op: VEC_PERM_EXPR
  { }
  lane permutation { 0[1] 0[0] }
  children 0x494ec40
node 0x494ec40 (max_nunits=1, refcnt=2)
op template: _8 = REALPART_EXPR <*_5>;
  stmt 0 _8 = REALPART_EXPR <*_5>;
  stmt 1 _6 = IMAGPART_EXPR <*_5>;
  load permutation { 0 1 }

and later during optimize_slp we get

Tranforming SLP expression from ADDSUB to COMPLEX_ADD_ROT270
processing node 0x494ebb0
simplifying permute node 0x494ebb0
Optimized SLP instances:
node 0x494e970 (max_nunits=4, refcnt=1)
op template: REALPART_EXPR <*_10> = _23;
   stmt 0 REALPART_EXPR <*_10> = _23;
   stmt 1 IMAGPART_EXPR <*_10> = _22;
   children 0x494ea00
node 0x494ea00 (max_nunits=4, refcnt=1)
op template: slp_patt_39 = .COMPLEX_ADD_ROT270 (_23, _23);
   stmt 0 _23 = _6 + _13;
   stmt 1 _22 = _12 - _8;
   children 0x494eb20 0x494ebb0
node 0x494eb20 (max_nunits=4, refcnt=1)
op template: _13 = REALPART_EXPR <*_3>;
   stmt 0 _13 = REALPART_EXPR <*_3>;
   stmt 1 _12 = IMAGPART_EXPR <*_3>;
node 0x494ebb0 (max_nunits=4, refcnt=1)
op: VEC_PERM_EXPR
   { }
   lane permutation { 0[0] 0[1] }
   children 0x494ec40
node 0x494ec40 (max_nunits=1, refcnt=2)
op template: _8 = REALPART_EXPR <*_5>;
   stmt 0 _8 = REALPART_EXPR <*_5>;
   stmt 1 _6 = IMAGPART_EXPR <*_5>;

Now I still have to elide the VEC_PERM_EXPR here but that's easy.

This works for ADD, but it doesn't work well when there's a complicated sequence
of loads.  So for example

#include <complex.h>

#define N 200
void g (double complex a[restrict N], double complex b[restrict N],
	double complex c[restrict N])
{
  for (int i=0; i < N; i++)
    {
      c[i] =  a[i] * b[i];
    }
}

will result in an SLP tree where each operand of the multiply does not get to
see all the original vectors:

Final SLP tree for instance 0x5678a30:
node 0x55965a0 (max_nunits=4, refcnt=2)
op template: REALPART_EXPR <*_7> = _25;
  stmt 0 REALPART_EXPR <*_7> = _25;
  stmt 1 IMAGPART_EXPR <*_7> = _26;
  children 0x5596630
node 0x5596630 (max_nunits=4, refcnt=2)
op: VEC_PERM_EXPR
  stmt 0 _25 = _17 - _22;
  stmt 1 _26 = _23 + _24;
  lane permutation { 0[0] 1[1] }
  children 0x5596a20 0x5596ab0
node 0x5596a20 (max_nunits=1, refcnt=1)
op template: _25 = _17 - _22;
  { }
  children 0x55966c0 0x5596870
node 0x55966c0 (max_nunits=4, refcnt=3)
op template: _17 = _10 * _19;
  stmt 0 _17 = _10 * _19;
  stmt 1 _23 = _10 * _18;
  children 0x5596750 0x55967e0
node 0x5596750 (max_nunits=4, refcnt=2)
op template: _10 = REALPART_EXPR <*_3>;
  stmt 0 _10 = REALPART_EXPR <*_3>;
  stmt 1 _10 = REALPART_EXPR <*_3>;
  load permutation { 0 0 }
node 0x55967e0 (max_nunits=4, refcnt=2)
op template: _19 = REALPART_EXPR <*_5>;
  stmt 0 _19 = REALPART_EXPR <*_5>;
  stmt 1 _18 = IMAGPART_EXPR <*_5>;
  load permutation { 0 1 }
node 0x5596870 (max_nunits=4, refcnt=3)
op template: _22 = _9 * _18;
  stmt 0 _22 = _9 * _18;
  stmt 1 _24 = _9 * _19;
  children 0x5596900 0x5596990
node 0x5596900 (max_nunits=4, refcnt=2)
op template: _9 = IMAGPART_EXPR <*_3>;
  stmt 0 _9 = IMAGPART_EXPR <*_3>;
  stmt 1 _9 = IMAGPART_EXPR <*_3>;
  load permutation { 1 1 }
node 0x5596990 (max_nunits=4, refcnt=2)
op template: _18 = IMAGPART_EXPR <*_5>;
  stmt 0 _18 = IMAGPART_EXPR <*_5>;
  stmt 1 _19 = REALPART_EXPR <*_5>;
  load permutation { 1 0 }
node 0x5596ab0 (max_nunits=1, refcnt=1)
op template: _26 = _23 + _24;
  { }
  children 0x55966c0 0x5596870

So depending on the node, each multiply sees either the REALPART of *_3 plus
the REALPART and IMAGPART of *_5, or the IMAGPART of *_3 plus the IMAGPART and
REALPART of *_5.
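
For reference the statements above are just the usual scalar expansion of the
complex multiply.  A sketch with the SSA names from the dump as comments
(assuming *_3 is the a[i] access and *_5 the b[i] access):

  double a_re = __real__ a[i];                 /* _10 */
  double a_im = __imag__ a[i];                 /* _9  */
  double b_re = __real__ b[i];                 /* _19 */
  double b_im = __imag__ b[i];                 /* _18 */
  __real__ c[i] = a_re * b_re - a_im * b_im;   /* _25 = _17 - _22 */
  __imag__ c[i] = a_re * b_im + a_im * b_re;   /* _26 = _23 + _24 */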

Since we are removing the TWO_OPERANDS node we need to drop one of the
multiplies, and so we need to give the remaining one the original two vectors
as parameters.  The current implementation emits a permute operation to combine
the loads again and later pushes the permute down.  This is problematic as it
again requires us to do dataflow analysis early.

To counter this, in the patch I have changed loads to no longer come out of the
SLP build with LOAD_PERMUTATIONs but instead to have a VEC_PERM node above each
load.

This is only temporary: the idea is that optimize_slp will push them back down
into the loads once it's determined that permutes are still needed.

So COMPLEX MUL becomes:

Final SLP tree for instance 0x46d9c80:
node 0x45f76c0 (max_nunits=4, refcnt=2)
op template: REALPART_EXPR <*_7> = _25;
  stmt 0 REALPART_EXPR <*_7> = _25;
  stmt 1 IMAGPART_EXPR <*_7> = _26;
  children 0x45f7750
node 0x45f7750 (max_nunits=4, refcnt=2)
op: VEC_PERM_EXPR
  stmt 0 _25 = _17 - _22;
  stmt 1 _26 = _23 + _24;
  lane permutation { 0[0] 1[1] }
  children 0x45f7bd0 0x45f7c60
node 0x45f7bd0 (max_nunits=1, refcnt=1)
op template: _25 = _17 - _22;
  { }
  children 0x45f77e0 0x45f7a20
node 0x45f77e0 (max_nunits=4, refcnt=3)
op template: _17 = _10 * _19;
  stmt 0 _17 = _10 * _19;
  stmt 1 _23 = _10 * _18;
  children 0x45f7870 0x45f7990
node 0x45f7870 (max_nunits=4, refcnt=2)
op: VEC_PERM_EXPR
  { }
  lane permutation { 0[0] 0[0] }
  children 0x45f7900
node 0x45f7900 (max_nunits=1, refcnt=4)
op template: _10 = REALPART_EXPR <*_3>;
  stmt 0 _10 = REALPART_EXPR <*_3>;
  stmt 1 _9 = IMAGPART_EXPR <*_3>;
  load permutation { 0 1 }
node 0x45f7990 (max_nunits=4, refcnt=3)
op template: _19 = REALPART_EXPR <*_5>;
  stmt 0 _19 = REALPART_EXPR <*_5>;
  stmt 1 _18 = IMAGPART_EXPR <*_5>;
  load permutation { 0 1 }
node 0x45f7a20 (max_nunits=4, refcnt=3)
op template: _22 = _9 * _18;
  stmt 0 _22 = _9 * _18;
  stmt 1 _24 = _9 * _19;
  children 0x45f7ab0 0x45f7b40
node 0x45f7ab0 (max_nunits=4, refcnt=2)
op: VEC_PERM_EXPR
  { }
  lane permutation { 0[1] 0[1] }
  children 0x45f7900
node 0x45f7b40 (max_nunits=4, refcnt=2)
op: VEC_PERM_EXPR
  { }
  lane permutation { 0[1] 0[0] }
  children 0x45f7990
node 0x45f7c60 (max_nunits=1, refcnt=1)
op template: _26 = _23 + _24;
  { }
  children 0x45f77e0 0x45f7a20

This representation has a number of advantages:

 * complex mul becomes trivial, as trivial as add.
 * add now detects itself as part of a sequence of complex mul, i.e. if you
   only have the add it will still be used instead of failing to follow the
   dataflow as happens now.
 * It opens the door a bit more to vectorizing the first loop in x264.  The
   code there has a cross-iteration mix of loading the full vector, but in an
   order that the vectorizer thinks it can't use a single linear load for, e.g.

   .. = (a[0] + b[0]) - (a[4] + b[4])
   .. = (a[1] + b[1]) - (a[5] + b[5])
   .. = (a[2] + b[2]) - (a[6] + b[6])
   .. = (a[3] + b[3]) - (a[7] + b[7])

 Ultimately it thinks it's cheaper to use scalar loads here, while (at least for
 aarch64) if we see the unrolled loop together we can pattern match this
 sequence and rewrite it to something that uses the linear load.  This is
 because the addition and subtraction are widening, so the widening arithmetic
 operations we already have only read half the vector anyway.

 For this to work, we need to see the loads of all 16 bytes, i.e. the full
 unrolled loop of 4 iterations, which currently doesn't happen for NEON but
 does for SVE.  (Though I wonder, now that you've added SLP decomposition,
 whether the initial SLP build shouldn't allow building SLP trees that are
 wider than the vector length and later either decompose them or bump up the
 VF.)  However this is something for another discussion :).

 Changing the loads like this makes things a lot simpler and we require no
 additional work anywhere else.  The complexity ends up in optimize_slp, which
 now has the following changes:

 * Loads no longer introduce permutes, so they don't seed the permutations.
   Instead each load can have a permute node above it which seeds the permutes.
 * As we're moving permutes up we stop at call boundaries as well.
 * While calculating materialization points for the permute we inspect the
   node and check if we can transform it; if so we mark it for transformation.
 * After calculating the materialization and transform points we iterate over
   the transform points checking whether we should transform.  If we should, we
   modify the permute at the materialization point (which should be at the
   transform point) into the inverse permute (sketched below).
 * During materialization the (now inverted) permute is then applied, which
   undoes the permutation if we transformed the instruction.
 * After computing this we elide the permute node, or push it down towards the
   load and then back into the load itself, splitting the node if required.
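
The inverse permute mentioned above is the permutation that composes with the
recorded one to give the identity, e.g. the inverse of { 2, 0, 1 } is
{ 1, 2, 0 }.  A minimal sketch of what the patch's vect_reverse_perm computes
(vect_invert_perm is just an illustrative name; the qsort in the patch could
equally be a direct scatter):

static load_permutation_t
vect_invert_perm (load_permutation_t perm)
{
  load_permutation_t inv;
  inv.create (perm.length ());
  inv.quick_grow (perm.length ());
  /* PERM[i] is the source lane feeding lane i, so send it back.  */
  for (unsigned i = 0; i < perm.length (); ++i)
    inv[perm[i]] = i;
  return inv;
}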

After this the tree should be back to its normal form.  I still need to work
out when and where one needs to push down the permute all the way to the loads.

But this covers my thoughts.  Any feedback is appreciated to get it right early
on :)

Regards,
Tamar

--- inline copy of patch -- 
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index b2f414d2131b867eda337cd30f5ed40ed7c9fa10..73d1d742414d16e06e8312967990d565eddb218d 100644


--

Comments

Richard Biener June 22, 2021, 12:08 p.m. UTC | #1
On Mon, 21 Jun 2021, Tamar Christina wrote:

> Hi Richi,
> 
> This patch is still very much incomplete and I do know that it is 
> missing things but it's complete enough such that examples are working 
> and allows me to show what I'm working towards.
> 
> note, that this approach will remove a lot of code in 
> tree-vect-slp-patterns but to keep the diff readable I've left them in 
> and just commented out the calls or removed them where needed.
> 
> The patch rewrites the complex numbers detection by splitting the 
> detection of structure from dataflow analysis.  In principle the biggest 
> difference between this and the previous implementation is that instead 
> of trying to detect valid complex operations it *makes* an operation a 
> valid complex operation.
> 
> To do this each operation gets a dual optab which matches the same structure but
> has no dataflow requirement.
> 
> i.e. in this patch I added 4, ADDSUB, SUBADD, MUL_ADDSUB, MULL_SUBADD.
> 
> There is a then a mapping between these and their variant with the dataflow:
> 
> * ADDSUB -> COMPLEX_ADD_ROT270
> * SUBADD -> COMPLEX_ADD_ROT90
> * MUL_ADDSUB -> COMPLEX_MUL_CONJ
> * MUL_SUBADD -> COMPLEX_MUL
> 
> with the intention that when we detect the structure of an operation we query
> the backend for both optabs.
> 
> This should result in one of three states:
> 
>  * not supported: Move on.
>  * Supports ADDSUB only: Rewrite using ADDSUB, set type to 'cannot_transform'
>  * Supports COMPLEX_ADD_ROT270 only: Rewrite using ADDSUB, set type to 'must_transform'
>  * Supports both: Rewrite using ADDSUB, set type fo 'can_transform'
> 
> with the idea behind `can_transform` is to check the costs of the inverse
> permute needed to use the complex operation and if this is very expensive then
> stick to addsub.  This requires the target to be able to cost the operations
> reasonably correct.
> 
> So for ADD this looks like
> 
>  === vect_match_slp_patterns ===
>  Analyzing SLP tree 0x494e970 for patterns
>  Found ADDSUB pattern in SLP tree
>  Target does not support ADDSUB for vector type vector(4) float
>  Found COMPLEX_ADD_ROT270 pattern in SLP tree
>  Target supports COMPLEX_ADD_ROT270 vectorization with mode vector(4) float
> Pattern matched SLP tree
> node 0x494e970 (max_nunits=4, refcnt=1)
> op template: REALPART_EXPR <*_10> = _23;
>   stmt 0 REALPART_EXPR <*_10> = _23;
>   stmt 1 IMAGPART_EXPR <*_10> = _22;
>   children 0x494ea00
> node 0x494ea00 (max_nunits=4, refcnt=1)
> op template: slp_patt_39 = .ADDSUB (_23, _23);
>   stmt 0 _23 = _6 + _13;
>   stmt 1 _22 = _12 - _8;
>   children 0x494eb20 0x494ebb0
> node 0x494eb20 (max_nunits=4, refcnt=1)
> op template: _13 = REALPART_EXPR <*_3>;
>   stmt 0 _13 = REALPART_EXPR <*_3>;
>   stmt 1 _12 = IMAGPART_EXPR <*_3>;
> node 0x494ebb0 (max_nunits=4, refcnt=1)
> op: VEC_PERM_EXPR
>   { }
>   lane permutation { 0[1] 0[0] }
>   children 0x494ec40
> node 0x494ec40 (max_nunits=1, refcnt=2)
> op template: _8 = REALPART_EXPR <*_5>;
>   stmt 0 _8 = REALPART_EXPR <*_5>;
>   stmt 1 _6 = IMAGPART_EXPR <*_5>;
>   load permutation { 0 1 }
> 
> and later during optimize_slp we get
> 
> Tranforming SLP expression from ADDSUB to COMPLEX_ADD_ROT270
> processing node 0x494ebb0
> simplifying permute node 0x494ebb0
> Optimized SLP instances:
> node 0x494e970 (max_nunits=4, refcnt=1)
> op template: REALPART_EXPR <*_10> = _23;
>    stmt 0 REALPART_EXPR <*_10> = _23;
>    stmt 1 IMAGPART_EXPR <*_10> = _22;
>    children 0x494ea00
> node 0x494ea00 (max_nunits=4, refcnt=1)
> op template: slp_patt_39 = .COMPLEX_ADD_ROT270 (_23, _23);
>    stmt 0 _23 = _6 + _13;
>    stmt 1 _22 = _12 - _8;
>    children 0x494eb20 0x494ebb0
> node 0x494eb20 (max_nunits=4, refcnt=1)
> op template: _13 = REALPART_EXPR <*_3>;
>    stmt 0 _13 = REALPART_EXPR <*_3>;
>    stmt 1 _12 = IMAGPART_EXPR <*_3>;
> node 0x494ebb0 (max_nunits=4, refcnt=1)
> op: VEC_PERM_EXPR
>    { }
>    lane permutation { 0[0] 0[1] }
>    children 0x494ec40
> node 0x494ec40 (max_nunits=1, refcnt=2)
> op template: _8 = REALPART_EXPR <*_5>;
>    stmt 0 _8 = REALPART_EXPR <*_5>;
>    stmt 1 _6 = IMAGPART_EXPR <*_5>;
> 
> Now I still have to elide the VEC_PERM_EXPR here but that's easy.

So having skimmed half of the patch - this means SLP pattern recog
will initially recognize { +, -, +, - } as ADDSUB for example
but not factor in lane permutes from loads yet.  Now, suppose we
have { +, -, -, + } seen in pattern recog - how's that handled?
It might feed operations where we'd like to have inputs permuted
and thus would end up with ADDSUB in the end?

That said, you do

+         /* Check to see if this node can be transformed during permute
+            materialization.  */
+         bool patt_trans_cand_p = is_pattern_convert_candidate_p (node);
+         if (patt_trans_cand_p)
+           bitmap_set_bit (n_transform, idx);

in the propagation stage (but this looks like a static marking).

And then just for each of those candidates you somehow "undo" the
permutes on the SLP children (by modifying them in-place - uh, see below).
But if you find one child that is not permuted you've done that
anyway but stop.

So what I've expected is that this "transform" is done during
propagation (where you now set the n_transform bit), by means of
detecting the candidate as materialization point (but I'd have to
recap the kind of permutes we expect to be incoming).  That is,
for a specific set of permutes on the successor edges claim
the node can handle the permute and thus propagate unpermuted
upwards.

At the materialization stage you'd then transform the node.
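
Something like the following inside the propagation loop, very schematically
(complex_ifn_handles_perm_p is a made-up helper here):

      /* Sketch: a candidate node claims a matching permute from its
         children, records that it will be transformed and exposes no
         permute upwards.  */
      if (is_pattern_convert_candidate_p (node)
          && perm != 0
          && complex_ifn_handles_perm_p (node, perms[perm]))
        {
          bitmap_set_bit (n_transform, idx);    /* rewrite the IFN later */
          bitmap_set_bit (n_materialize, idx);  /* absorb the permute here */
          perm = 0;                             /* propagate "unpermuted" */
        }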

Btw, how do we handle targets that can do complex_addsub but not
addsub?

> This works for ADD, but it doesn't work well when there's a complicated sequence
> of loads.  So for example
> 
> #define N 200
> void g (double complex a[restrict N], double complex b[restrict N],
> 	double complex c[restrict N])
> {
>   for (int i=0; i < N; i++)
>     {
>       c[i] =  a[i] * b[i];
>     }
> }
> 
> will results in an SLP tree where each operand of the multiply does not get to
> see all the original vectors:
> 
> Final SLP tree for instance 0x5678a30:
> node 0x55965a0 (max_nunits=4, refcnt=2)
> op template: REALPART_EXPR <*_7> = _25;
>   stmt 0 REALPART_EXPR <*_7> = _25;
>   stmt 1 IMAGPART_EXPR <*_7> = _26;
>   children 0x5596630
> node 0x5596630 (max_nunits=4, refcnt=2)
> op: VEC_PERM_EXPR
>   stmt 0 _25 = _17 - _22;
>   stmt 1 _26 = _23 + _24;
>   lane permutation { 0[0] 1[1] }
>   children 0x5596a20 0x5596ab0
> node 0x5596a20 (max_nunits=1, refcnt=1)
> op template: _25 = _17 - _22;
>   { }
>   children 0x55966c0 0x5596870
> node 0x55966c0 (max_nunits=4, refcnt=3)
> op template: _17 = _10 * _19;
>   stmt 0 _17 = _10 * _19;
>   stmt 1 _23 = _10 * _18;
>   children 0x5596750 0x55967e0
> node 0x5596750 (max_nunits=4, refcnt=2)
> op template: _10 = REALPART_EXPR <*_3>;
>   stmt 0 _10 = REALPART_EXPR <*_3>;
>   stmt 1 _10 = REALPART_EXPR <*_3>;
>   load permutation { 0 0 }
> node 0x55967e0 (max_nunits=4, refcnt=2)
> op template: _19 = REALPART_EXPR <*_5>;
>   stmt 0 _19 = REALPART_EXPR <*_5>;
>   stmt 1 _18 = IMAGPART_EXPR <*_5>;
>   load permutation { 0 1 }
> node 0x5596870 (max_nunits=4, refcnt=3)
> op template: _22 = _9 * _18;
>   stmt 0 _22 = _9 * _18;
>   stmt 1 _24 = _9 * _19;
>   children 0x5596900 0x5596990
> node 0x5596900 (max_nunits=4, refcnt=2)
> op template: _9 = IMAGPART_EXPR <*_3>;
>   stmt 0 _9 = IMAGPART_EXPR <*_3>;
>   stmt 1 _9 = IMAGPART_EXPR <*_3>;
>   load permutation { 1 1 }
> node 0x5596990 (max_nunits=4, refcnt=2)
> op template: _18 = IMAGPART_EXPR <*_5>;
>   stmt 0 _18 = IMAGPART_EXPR <*_5>;
>   stmt 1 _19 = REALPART_EXPR <*_5>;
>   load permutation { 1 0 }
> node 0x5596ab0 (max_nunits=1, refcnt=1)
> op template: _26 = _23 + _24;
>   { }
>   children 0x55966c0 0x5596870
> 
> So depending on the node each multiply either sees REAL 3 + REAL 5 + IMAG 5 or
> IMAG 3 + IMAG 5 + REAL 5.
> 
> since we are removing the TWO_OPERANDS node we need to drop one of the multiply
> and so we need to give it the original 2 vectors as a parameter.  The current
> implementation emits a permute operation to combine the loads again and later
> pushes the permute down.  This is problematic as it again requires us to do df
> analysis early.
> 
> To counter this, in the patch I have changed loads to no longer come out of
> build_slp with LOAD_PERMUTES but instead to have a VEC_PERM above each load.

Yep.  I've been there before (not sure if I ever sent you my 
work-in-progress here).  There are some things wrong in your patch but I guess
doing this exactly for the permutes optimize_slp handles would be fine.

We should look at doing this independently of the stuff above; I can handle
this and will prepare a patch for it later.

> This is only temporary and the idea is that optimize SLP will push them back
> down into the loads once it's deemed there are still permutes needed.
> 
> So COMPLEX MUL becomes:
> 
> Final SLP tree for instance 0x46d9c80:
> node 0x45f76c0 (max_nunits=4, refcnt=2)
> op template: REALPART_EXPR <*_7> = _25;
>   stmt 0 REALPART_EXPR <*_7> = _25;
>   stmt 1 IMAGPART_EXPR <*_7> = _26;
>   children 0x45f7750
> node 0x45f7750 (max_nunits=4, refcnt=2)
> op: VEC_PERM_EXPR
>   stmt 0 _25 = _17 - _22;
>   stmt 1 _26 = _23 + _24;
>   lane permutation { 0[0] 1[1] }
>   children 0x45f7bd0 0x45f7c60
> node 0x45f7bd0 (max_nunits=1, refcnt=1)
> op template: _25 = _17 - _22;
>   { }
>   children 0x45f77e0 0x45f7a20
> node 0x45f77e0 (max_nunits=4, refcnt=3)
> op template: _17 = _10 * _19;
>   stmt 0 _17 = _10 * _19;
>   stmt 1 _23 = _10 * _18;
>   children 0x45f7870 0x45f7990
> node 0x45f7870 (max_nunits=4, refcnt=2)
> op: VEC_PERM_EXPR
>   { }
>   lane permutation { 0[0] 0[0] }
>   children 0x45f7900
> node 0x45f7900 (max_nunits=1, refcnt=4)
> op template: _10 = REALPART_EXPR <*_3>;
>   stmt 0 _10 = REALPART_EXPR <*_3>;
>   stmt 1 _9 = IMAGPART_EXPR <*_3>;
>   load permutation { 0 1 }
> node 0x45f7990 (max_nunits=4, refcnt=3)
> op template: _19 = REALPART_EXPR <*_5>;
>   stmt 0 _19 = REALPART_EXPR <*_5>;
>   stmt 1 _18 = IMAGPART_EXPR <*_5>;
>   load permutation { 0 1 }
> node 0x45f7a20 (max_nunits=4, refcnt=3)
> op template: _22 = _9 * _18;
>   stmt 0 _22 = _9 * _18;
>   stmt 1 _24 = _9 * _19;
>   children 0x45f7ab0 0x45f7b40
> node 0x45f7ab0 (max_nunits=4, refcnt=2)
> op: VEC_PERM_EXPR
>   { }
>   lane permutation { 0[1] 0[1] }
>   children 0x45f7900
> node 0x45f7b40 (max_nunits=4, refcnt=2)
> op: VEC_PERM_EXPR
>   { }
>   lane permutation { 0[1] 0[0] }
>   children 0x45f7990
> node 0x45f7c60 (max_nunits=1, refcnt=1)
> op template: _26 = _23 + _24;
>   { }
>   children 0x45f77e0 0x45f7a20
> 
> This representation has a number of advantages:
> 
>  * complex mul becomes trivial, as trivial as add.
>  * add now detects itself as part of a sequence of complex mul.  i.e. if you
>    have only add, it will use it instead of failing to follow the df like now.
>  * It opens the door a bit more to vectorize the first loop in x264.  The code
>    there has a cross iteration mix of loading the full vector but in an order
>    that the vectorizer thinks it can't use a single linear load for.  e.g.
> 
>    .. = (a[0] + b[0]) - (a[4] + b[4])
>    .. = (a[1] + b[1]) - (a[5] + b[5])
>    .. = (a[2] + b[2]) - (a[6] + b[6])
>    .. = (a[3] + b[3]) - (a[7] + b[6])
> 
>  Ultimately it thinks it's cheaper to use scalar loads here, while (at least for
>  aarch64) if we see the unrolled loop together we can pattern match this sequence
>  and rewrite it to something that uses the linear load. This is because the
>  addition and subtract are widening.  So the widening arithmetic operations we
>  have already only read half the vector since they're widening.
> 
>  For this to work, we need to see all 16 bytes loads, so the full unrolled loop
>  of 4 iterations. Which currently doesn't happen for NEON but does for SVE. (though
>  I wonder if now that you've added SLP decomposition if the initial SLP build
>  shouldn't allow building of SLP trees that are wider than the vector length and
>  later just decompose it or bump up the VF.).  However this is something for
>  another discussion :).
> 
>  Changing the loads like this makes things a lot simpler and we require no
>  additional work anywhere else.  The complexity ends up in optimize SLP which
>  now has the following changes:
> 
>  * loads no longer introduce permutes, so they don't see the permutations.
>    Instead each load can lead to a permute which can see the permutes.
>  * As we're moving permutes up we stop at call boundaries as well.
>  * As we're calculating materialization points for the permute we inspect the
>    node and check if we can transform it, if so we mark it for transformation.
>  * After calculating the materialization and transform points we iterate over
>    the transform points checking whether we should transform.  If we should we
>    modify the permute at the materialization point (which should be at the
>    transform point) into the inverse permute.
>  * During materialization as the permute is transformed this would undo the
>    permutation if we transformed the instruction.
>  * After computing this we elide the permute node, or push it down towards the
>    load and then back into the load itself, splitting the node if required.
> 
> After this the tree should be back to it's normal form.  I still need to 
> work out when and where one needs to push down the permute all the way 
> to the loads.

It's indeed the case that the current permute "propagation" is away from
loads only and a phase to propagate in the reverse direction is missing
if we now have permute sources other than loads.  The cycle handling
of the propagation is also a bit awkward and incomplete (but it "works").

Richard.

> But this covers my thoughts.  Any feedback is appreciated to get it right early
> on :)
> 
> Regards,
> Tamar
> 
> --- inline copy of patch -- 
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index b2f414d2131b867eda337cd30f5ed40ed7c9fa10..73d1d742414d16e06e8312967990d565eddb218d 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -277,6 +277,10 @@ DEF_INTERNAL_FLT_FN (SCALB, ECF_CONST, scalb, binary)
>  DEF_INTERNAL_FLT_FLOATN_FN (FMIN, ECF_CONST, fmin, binary)
>  DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
>  DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
> +DEF_INTERNAL_OPTAB_FN (ADDSUB, ECF_CONST, addsub, binary)
> +DEF_INTERNAL_OPTAB_FN (SUBADD, ECF_CONST, subadd, binary)
> +DEF_INTERNAL_OPTAB_FN (MUL_ADDSUB, ECF_CONST, mul_addsub, binary)
> +DEF_INTERNAL_OPTAB_FN (MUL_SUBADD, ECF_CONST, mul_subadd, binary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index b192a9d070b8aa72e5676b2eaa020b5bdd7ffcc8..73d23c49bdfa7423f67a6240650c558cfaf3f2f0 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -290,6 +290,10 @@ OPTAB_D (atan_optab, "atan$a2")
>  OPTAB_D (atanh_optab, "atanh$a2")
>  OPTAB_D (copysign_optab, "copysign$F$a3")
>  OPTAB_D (xorsign_optab, "xorsign$F$a3")
> +OPTAB_D (addsub_optab, "addsub$a3")
> +OPTAB_D (subadd_optab, "subadd$a3")
> +OPTAB_D (mul_addsub_optab, "mul_addsub$a3")
> +OPTAB_D (mul_subadd_optab, "mul_subadd$a3")
>  OPTAB_D (cadd90_optab, "cadd90$a3")
>  OPTAB_D (cadd270_optab, "cadd270$a3")
>  OPTAB_D (cmul_optab, "cmul$a3")
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index 2ed49cd9edcabd7948b365dd60d7405b79079a7b..ce8d2daa1f06f17eda2cb08255d76bac9cf263f0 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -68,6 +68,39 @@ along with GCC; see the file COPYING3.  If not see
>   * vect_pattern class
>   ******************************************************************************/
>  
> +static bool
> +vect_pattern_validate_optab (internal_fn ifn, slp_tree node);
> +
> +
> +/* Check if the base or complex equivalents are supported.  */
> +static bool
> +vect_pattern_validate_optab_or_eq (internal_fn ifn, slp_tree node)
> +{
> +  if (vect_pattern_validate_optab (ifn, node))
> +    return true;
> +
> +  switch (ifn)
> +  {
> +    case IFN_ADDSUB:
> +      ifn = IFN_COMPLEX_ADD_ROT270;
> +      break;
> +    case IFN_SUBADD:
> +      ifn = IFN_COMPLEX_ADD_ROT90;
> +      break;
> +    case IFN_MUL_ADDSUB:
> +      ifn = IFN_COMPLEX_MUL_CONJ;
> +      break;
> +    case IFN_MUL_SUBADD:
> +      ifn = IFN_COMPLEX_MUL;
> +      break;
> +    default:
> +      gcc_unreachable ();
> +  }
> +
> +  return vect_pattern_validate_optab (ifn, node);
> +}
> +
> +
>  /* Default implementation of recognize that performs matching, validation and
>     replacement of nodes but that can be overriden if required.  */
>  
> @@ -630,8 +663,7 @@ complex_add_pattern::build (vec_info *vinfo)
>  
>    /* First re-arrange the children.  */
>    SLP_TREE_CHILDREN (*this->m_node)[0] = children[0];
> -  SLP_TREE_CHILDREN (*this->m_node)[1] =
> -    vect_build_swap_evenodd_node (children[1]);
> +  SLP_TREE_CHILDREN (*this->m_node)[1] = children[1];
>  
>    SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (*this->m_node)[0])++;
>    SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (*this->m_node)[1])++;
> @@ -670,9 +702,9 @@ complex_add_pattern::matches (complex_operation_t op,
>        Rotation 0 and 180 can be handled by normal SIMD code, so we don't need
>        to care about them here.  */
>    if (op == MINUS_PLUS)
> -    ifn = IFN_COMPLEX_ADD_ROT90;
> +    ifn = IFN_SUBADD;
>    else if (op == PLUS_MINUS)
> -    ifn = IFN_COMPLEX_ADD_ROT270;
> +    ifn = IFN_ADDSUB;
>    else
>      return ifn;
>  
> @@ -680,17 +712,7 @@ complex_add_pattern::matches (complex_operation_t op,
>       we support.  */
>    gcc_assert (ops->length () == 2);
>  
> -  vec<slp_tree> children = SLP_TREE_CHILDREN ((*ops)[0]);
> -
> -  /* First node must be unpermuted.  */
> -  if (linear_loads_p (perm_cache, children[0]) != PERM_EVENODD)
> -    return IFN_LAST;
> -
> -  /* Second node must be permuted.  */
> -  if (linear_loads_p (perm_cache, children[1]) != PERM_ODDEVEN)
> -    return IFN_LAST;
> -
> -  if (!vect_pattern_validate_optab (ifn, *node))
> +  if (!vect_pattern_validate_optab_or_eq (ifn, *node))
>      return IFN_LAST;
>  
>    return ifn;
> @@ -1501,7 +1523,8 @@ vect_pattern_decl_t slp_patterns[]
>       order patterns from the largest to the smallest.  Especially if they
>       overlap in what they can detect.  */
>  
> -  SLP_PATTERN (complex_operations_pattern),
> +  SLP_PATTERN (complex_add_pattern),
> +  //SLP_PATTERN (complex_operations_pattern),
>  };
>  #undef SLP_PATTERN
>  
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 0c1f85beeb2e9f3fb7c66c15d4d30594b2570f9e..f24ad85d15780807c49153895b7340c9b4d4d1f3 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -1764,25 +1764,69 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>  	{
>  	  *max_nunits = this_max_nunits;
>  	  (*tree_size)++;
> -	  node = vect_create_new_slp_node (node, stmts, 0);
> -	  SLP_TREE_VECTYPE (node) = vectype;
> +	  node = vect_create_new_slp_node (node, stmts, 1);
>  	  /* And compute the load permutation.  Whether it is actually
>  	     a permutation depends on the unrolling factor which is
> -	     decided later.  */
> -	  vec<unsigned> load_permutation;
> +	     decided later.  We specifically materialize permutes at this
> +	     early stage and leave it up to optimize SLP to push them back
> +	     into the load if needed.  */
> +	  lane_permutation_t lane_permutation;
> +
> +	  /* See the loads with a linear permute, this so that optimize_slp
> +	     can later push the permute upwards and downwards without needing
> +	     to inspect the parent node.  */
> +	  load_permutation_t load_permutation;
>  	  int j;
> -	  stmt_vec_info load_info;
> +	  stmt_vec_info load_info, curr_load;
> +	  lane_permutation.create (group_size);
>  	  load_permutation.create (group_size);
> +	  vec<stmt_vec_info> group_stmts;
> +	  bool linear = true;
>  	  stmt_vec_info first_stmt_info
>  	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
> +	  group_stmts.create (DR_GROUP_SIZE (first_stmt_info));
> +	  curr_load = first_stmt_info;
>  	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
>  	    {
>  	      int load_place = vect_get_place_in_interleaving_chain
>  		  (load_info, first_stmt_info);
>  	      gcc_assert (load_place != -1);
> -	      load_permutation.safe_push (load_place);
> +
> +	      lane_permutation.safe_push (std::make_pair (0, load_place));
> +	      load_permutation.quick_push (j);
> +
> +	      group_stmts.quick_push (curr_load);
> +	      linear = linear && curr_load == load_info;
> +	      curr_load = DR_GROUP_NEXT_ELEMENT (curr_load);
> +	    }
> +
> +	  if (!linear)
> +	    {
> +	      slp_tree child;
> +	      if (slp_tree *load = bst_map->get (group_stmts))
> +		child = *load;
> +	      else
> +		{
> +		  child =  vect_create_new_slp_node (group_stmts.copy (), 0);
> +		  SLP_TREE_VECTYPE (child) = vectype;
> +		  /* One for me, one for the BST map.  */
> +		  SLP_TREE_REF_COUNT (child)++;
> +		  bst_map->put (group_stmts, child);
> +		}
> +
> +	      /* Set some metadata on the load node.  */
> +	      SLP_TREE_REF_COUNT (child)++;
> +	      SLP_TREE_LANES (child) = SLP_TREE_LANES (node);
> +	      SLP_TREE_LOAD_PERMUTATION (child) = load_permutation;
> +
> +	      /* But return a permute node.  */
> +	      SLP_TREE_LANE_PERMUTATION (node) = lane_permutation;
> +	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
> +	      SLP_TREE_CHILDREN (node).quick_push (child);
> +	      SLP_TREE_SCALAR_STMTS (node) = vNULL;
>  	    }
> -	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
> +
> +	  SLP_TREE_VECTYPE (node) = vectype;
>  	  return node;
>  	}
>      }
> @@ -3430,8 +3474,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
>        FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
>  	{
>  	  slp_tree root = SLP_INSTANCE_TREE (instance);
> -	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
> -					&load_map, root);
> +	  //optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
> +	//				&load_map, root);
>  	}
>      }
>  
> @@ -3548,6 +3592,83 @@ vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
>  			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
>  }
>  
> +static bool
> +is_pattern_convert_candidate_p (slp_tree node)
> +{
> +  stmt_vec_info stmt_info;
> +  if (!(stmt_info = SLP_TREE_REPRESENTATIVE (node))
> +      || !STMT_VINFO_SLP_VECT_ONLY_PATTERN (stmt_info))
> +    return false;
> +
> +  gimple* stmt = STMT_VINFO_STMT (stmt_info);
> +  if (!is_gimple_call (stmt))
> +    return false;
> +
> +  if (!gimple_call_internal_p (stmt))
> +    return false;
> +
> +  internal_fn ifn = gimple_call_internal_fn (stmt);
> +
> +  switch (ifn)
> +    {
> +    case IFN_ADDSUB:
> +    case IFN_SUBADD:
> +    case IFN_MUL_ADDSUB:
> +    case IFN_MUL_SUBADD:
> +      return true;
> +    default:
> +      break;
> +    }
> +
> +  return false;
> +}
> +
> +static internal_fn
> +vect_get_transformed_pattern (internal_fn ifn, slp_tree /* node */)
> +{
> +  switch (ifn)
> +    {
> +    case IFN_ADDSUB:
> +      ifn = IFN_COMPLEX_ADD_ROT270;
> +      break;
> +    case IFN_SUBADD:
> +      ifn = IFN_COMPLEX_ADD_ROT90;
> +      break;
> +    case IFN_MUL_ADDSUB:
> +      ifn = IFN_COMPLEX_MUL_CONJ;
> +      break;
> +    case IFN_MUL_SUBADD:
> +      ifn = IFN_COMPLEX_MUL;
> +      break;
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  return ifn;
> +}
> +
> +
> +static load_permutation_t
> +vect_reverse_perm (load_permutation_t perm)
> +{
> +  auto_vec<std::pair<unsigned, unsigned> > pairs;
> +  pairs.create (perm.length ());
> +  unsigned i;
> +  for (i = 0; i < perm.length (); i++)
> +    pairs.quick_push (std::make_pair (i, perm[i]));
> +
> +  typedef const std::pair<unsigned, unsigned>* cmp_t;
> +  pairs.qsort ([](const void* a, const void* b) -> int
> +    { return (int)((cmp_t)a)->second - (int)((cmp_t)b)->second; });
> +
> +  load_permutation_t rev_perm;
> +  rev_perm.create (perm.length ());
> +  for (i = 0; i < perm.length (); i++)
> +    rev_perm.quick_push (pairs[i].first);
> +
> +  return rev_perm;
> +}
> +
>  /* Optimize the SLP graph of VINFO.  */
>  
>  void
> @@ -3578,11 +3699,13 @@ vect_optimize_slp (vec_info *vinfo)
>  
>    auto_sbitmap n_visited (vertices.length ());
>    auto_sbitmap n_materialize (vertices.length ());
> +  auto_sbitmap n_transform (vertices.length ());
>    auto_vec<int> n_perm (vertices.length ());
>    auto_vec<vec<unsigned> > perms;
>  
>    bitmap_clear (n_visited);
>    bitmap_clear (n_materialize);
> +  bitmap_clear (n_transform);
>    n_perm.quick_grow_cleared (vertices.length ());
>    perms.safe_push (vNULL); /* zero is no permute */
>  
> @@ -3602,55 +3725,73 @@ vect_optimize_slp (vec_info *vinfo)
>  	 as entries to the reverse graph.  */
>        if (!slpg->vertices[idx].succ)
>  	bitmap_set_bit (n_visited, idx);
> -      /* Loads are the only thing generating permutes.  */
> -      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> -	continue;
>  
> -      /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
> -	 node unpermuted, record this permute.  */
> -      stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
> -      if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
> -	continue;
> -      dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
> -      unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
> -      bool any_permute = false;
> -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> +      int perm_seed = 0;
> +
> +      for (graph_edge *pred = slpg->vertices[idx].pred;
> +	   pred; pred = pred->pred_next)
>  	{
> -	  unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
> -	  imin = MIN (imin, idx);
> -	  imax = MAX (imax, idx);
> -	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
> -	    any_permute = true;
> -	}
> -      /* If there's no permute no need to split one out.  */
> -      if (!any_permute)
> -	continue;
> -      /* If the span doesn't match we'd disrupt VF computation, avoid
> -	 that for now.  */
> -      if (imax - imin + 1 != SLP_TREE_LANES (node))
> -	continue;
> +	  int src_idx = pred->src;
> +	  slp_tree src_node = vertices[src_idx];
>  
> -      /* For now only handle true permutes, like
> -	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> -	 when permuting constants and invariants keeping the permute
> -	 bijective.  */
> -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> -      bitmap_clear (load_index);
> -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> -	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> -      unsigned j;
> -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> -	if (!bitmap_bit_p (load_index, j))
> -	  break;
> -      if (j != SLP_TREE_LANES (node))
> -	continue;
> +	  /* Loads are the only thing generating permutes, however the permutes
> +	     are not yet lowered into the loads.  So instead chase up the chain
> +	     to find a permute node.  */
> +	  if (!SLP_TREE_LANE_PERMUTATION (src_node).exists ())
> +	    continue;
> +
> +	  /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
> +	     node unpermuted, record this permute.  */
> +	  stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
> +	  if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
> +	    continue;
> +	  dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
> +	  unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
> +	  bool any_permute = false;
> +	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
> +	    {
> +	      unsigned src_op = SLP_TREE_LANE_PERMUTATION (src_node)[j].first;
> +	      unsigned idx = SLP_TREE_LANE_PERMUTATION (src_node)[j].second;
> +	      /* This isn't a load permute moved out.  */
> +	      if (src_op != 0)
> +		continue;
> +	      imin = MIN (imin, idx);
> +	      imax = MAX (imax, idx);
> +	      if (idx - SLP_TREE_LANE_PERMUTATION (src_node)[0].second != j)
> +		any_permute = true;
> +	    }
> +	  /* If there's no permute no need to split one out.  */
> +	  if (!any_permute)
> +	    continue;
> +	  /* If the span doesn't match we'd disrupt VF computation, avoid
> +	     that for now.  */
> +	  if (imax - imin + 1 != SLP_TREE_LANES (node))
> +	    continue;
> +
> +	  /* For now only handle true permutes, like
> +	     vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> +	     when permuting constants and invariants keeping the permute
> +	     bijective.  */
> +	  auto_sbitmap load_index (SLP_TREE_LANES (src_node));
> +	  bitmap_clear (load_index);
> +	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
> +	    bitmap_set_bit (load_index, SLP_TREE_LANE_PERMUTATION (src_node)[j].second - imin);
> +	  unsigned j;
> +	  for (j = 0; j < SLP_TREE_LANES (src_node); ++j)
> +	  if (!bitmap_bit_p (load_index, j))
> +	    break;
> +	  if (j != SLP_TREE_LANES (src_node))
> +	    continue;
>  
> -      vec<unsigned> perm = vNULL;
> -      perm.safe_grow (SLP_TREE_LANES (node), true);
> -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> -	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> -      perms.safe_push (perm);
> -      n_perm[idx] = perms.length () - 1;
> +	  vec<unsigned> perm = vNULL;
> +	  perm.safe_grow (SLP_TREE_LANES (src_node), true);
> +	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
> +	    perm[j] = SLP_TREE_LANE_PERMUTATION (src_node)[j].second - imin;
> +	  perms.safe_push (perm);
> +	  n_perm[src_idx] = perms.length () - 1;
> +	  perm_seed = -1;
> +       }
> +      n_perm[idx] = perm_seed;
>      }
>  
>    /* Propagate permutes along the graph and compute materialization points.  */
> @@ -3667,12 +3808,12 @@ vect_optimize_slp (vec_info *vinfo)
>  	  slp_tree node = vertices[idx];
>  	  /* For leafs there's nothing to do - we've seeded permutes
>  	     on those above.  */
> -	  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
> +	  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def || !slpg->vertices[idx].succ)
>  	    continue;
>  
>  	  bitmap_set_bit (n_visited, idx);
>  
> -	  /* We cannot move a permute across a store.  */
> +	  /* We cannot move a permute across a store.  TODO: or a call.  */
>  	  if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
>  	      && DR_IS_WRITE
>  		   (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
> @@ -3723,6 +3864,12 @@ vect_optimize_slp (vec_info *vinfo)
>  	      changed = true;
>  	    }
>  
> +	  /* Check to see if this node can be transformed during permute
> +	     materialization.  */
> +	  bool patt_trans_cand_p = is_pattern_convert_candidate_p (node);
> +	  if (patt_trans_cand_p)
> +	    bitmap_set_bit (n_transform, idx);
> +
>  	  if (perm == 0)
>  	    continue;
>  
> @@ -3764,6 +3911,46 @@ vect_optimize_slp (vec_info *vinfo)
>      }
>    while (changed || iteration == 1);
>  
> +  bitmap_clear (n_visited);
> +
> +  /* Transform and record any permutes for the usage of the instruction.
> +     TODO: Check if it's cheaper to keep ADDSUB if available or use to
> +	   COMPLEX_ADD based on the reverse permute that's needed.  */
> +  sbitmap_iterator bi;
> +  EXECUTE_IF_SET_IN_BITMAP (n_transform, 0, i, bi)
> +    {
> +      slp_tree node = vertices[i];
> +
> +      for (graph_edge *succ = slpg->vertices[i].succ;
> +		       succ; succ = succ->succ_next)
> +	{
> +	  int perm_id = n_perm[succ->dest];
> +	  if (perm_id <= 0)
> +	    continue;
> +
> +	  load_permutation_t linperm = vect_reverse_perm (perms[perm_id]);
> +	  perms[perm_id].release ();
> +	  perms[perm_id] = linperm;
> +
> +          bitmap_set_bit (n_materialize, succ->dest);
> +	}
> +
> +      /* Transform the node if we have determined that it's beneficial to do so.
> +	 when we do so the perm that has been calculated for the children of the
> +	 node will be applied.  */
> +      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
> +      gimple* stmt = STMT_VINFO_STMT (stmt_info);
> +      internal_fn old_ifn = gimple_call_internal_fn (stmt);
> +      internal_fn ifn = vect_get_transformed_pattern (old_ifn ,node);
> +
> +      if (dump_enabled_p ())
> +	dump_printf_loc (MSG_NOTE, vect_location,
> +			 "Tranforming SLP expression from %s to %s\n",
> +			 internal_fn_name (old_ifn), internal_fn_name (ifn));
> +      gimple_call_set_internal_fn (as_a <gcall *> (stmt), ifn);
> +    }
> +
> +
>    /* Materialize.  */
>    for (i = 0; i < vertices.length (); ++i)
>      {
> @@ -3798,6 +3985,11 @@ vect_optimize_slp (vec_info *vinfo)
>  			    SLP_TREE_SCALAR_OPS (child), true);
>  	}
>  
> +      if (dump_enabled_p ())
> +	dump_printf_loc (MSG_NOTE, vect_location,
> +			 "processing node %p\n",
> +			 node);
> +
>        if (bitmap_bit_p (n_materialize, i))
>  	{
>  	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
> @@ -3986,6 +4178,16 @@ vect_optimize_slp (vec_info *vinfo)
>  	    }
>  	}
>      }
> +
> +  if (dump_enabled_p ())
> +    {
> +      dump_printf_loc (MSG_NOTE, vect_location, "Optimized SLP instances: \n");
> +      hash_set<slp_tree> visited;
> +      slp_instance instance;
> +      FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
> +	vect_print_slp_graph (MSG_NOTE, vect_location,
> +			      SLP_INSTANCE_TREE (instance), visited);
> +    }
>  }
>  
>  /* Gather loads reachable from the individual SLP graph entries.  */
> 
> 
>
Richard Biener June 22, 2021, 1:14 p.m. UTC | #2
On Tue, 22 Jun 2021, Richard Biener wrote:

> On Mon, 21 Jun 2021, Tamar Christina wrote:
> 
> > Hi Richi,
> > 
[...]
> > since we are removing the TWO_OPERANDS node we need to drop one of the multiply
> > and so we need to give it the original 2 vectors as a parameter.  The current
> > implementation emits a permute operation to combine the loads again and later
> > pushes the permute down.  This is problematic as it again requires us to do df
> > analysis early.
> > 
> > To counter this, in the patch I have changed loads to no longer come out of
> > build_slp with LOAD_PERMUTES but instead to have a VEC_PERM above each load.
> 
> Yep.  I've been there before (not sure if I ever sent you my 
> work-in-progress here).  There's some wrongs in your patch but I guess
> doing this exactly for the permutes optimize_slp handles would be fine.
> 
> We should see to do this independently of the stuff above, I can handle
> this and will prepare a patch for this later.

So it's of course difficult, and the current optimize_slp is tied closely
to what the original vect_attempt_slp_rearrange_stmts did.

If you for example consider

double x[2], y[2], z[4];
void foo ()
{
  z[0] = x[0] + y[0];
  z[1] = x[1] + y[1];
  z[2] = x[1] + y[0];
  z[3] = x[0] + y[1];
}

then the x[0], x[1] loads look uniform enough to be handled,
but of course we end up with a group_size of 4 here and
a { 0, 1, 1, 0 } load permutation which optimize_slp wouldn't
handle either.  Of course in the end we should have split
the SLP at the vector size boundary, but the question is how
we should ensure this (if at all ...) or if we should
eventually even create an SLP tree like

      { x[0], x[1] }
         |       \
         |     VEC_PERM { 0[1], 0[0] }
         \      /
      VEC_PERM { 0[0], 0[1], 1[0], 1[1] }
             |

for the load.  Note all lane splitting/duplicating has
effects on vectorization factor compute - one complication
I ran into when originally trying to split out load permutations
from loads.

I'm not sure whether your example motivating the load SLP
changes is good enough - if you consider that the loaded
values get modified, like { x[0]/a + y[1]/a, x[1]/a - y[0]/a },
splitting the load permutation from the load will not get you
the division CSEd at SLP build, and if we divide by a different
value there's no CSE opportunity.  What would work, and what
should work right now, is that you get a lane permute down but
you'll not know whether values are "related".  If you need that
info directly on the child you can look at the representative's
DR group leader, if any, as a heuristic.

I've pasted below what I was playing with, it shows CSE for
cases like

double x[2], z[4];
void foo ()
{
  z[0] = x[0] + 2 * x[1];
  z[1] = x[1] + 2 * x[0];
}

but it breaks various vect.exp tests that look for permutes
being merged with reductions (thus the optimize_slp propagation
somehow doesn't work - as I said it's a bit fragile).

Richard.

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 6a26ccdd290..187bbfb70db 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -343,12 +343,14 @@ vect_slp_tree_uniform_p (slp_tree node)
 }
 
 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
-   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
-   of the chain.  */
+   that starts from FIRST_STMT_INFO.  If ADD_GAPS is true then if there's
+   a gap between elements account for that as well.
+   Return -1 if the data-ref is not a part of the chain.  */
 
 int
 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
-				      stmt_vec_info first_stmt_info)
+				      stmt_vec_info first_stmt_info,
+				      bool add_gaps)
 {
   stmt_vec_info next_stmt_info = first_stmt_info;
   int result = 0;
@@ -362,7 +364,7 @@ vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
 	return result;
       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
       if (next_stmt_info)
-	result += DR_GROUP_GAP (next_stmt_info);
+	result += add_gaps ? DR_GROUP_GAP (next_stmt_info) : 1;
     }
   while (next_stmt_info);
 
@@ -1769,24 +1771,65 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	{
 	  *max_nunits = this_max_nunits;
 	  (*tree_size)++;
-	  node = vect_create_new_slp_node (node, stmts, 0);
-	  SLP_TREE_VECTYPE (node) = vectype;
 	  /* And compute the load permutation.  Whether it is actually
 	     a permutation depends on the unrolling factor which is
 	     decided later.  */
-	  vec<unsigned> load_permutation;
 	  int j;
 	  stmt_vec_info load_info;
+	  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmts[0]);
+	  vec<unsigned> load_permutation;
 	  load_permutation.create (group_size);
-	  stmt_vec_info first_stmt_info
-	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
-	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+	  bool any_non_contiguous = false;
+	  FOR_EACH_VEC_ELT (stmts, j, load_info)
 	    {
 	      int load_place = vect_get_place_in_interleaving_chain
-		  (load_info, first_stmt_info);
+		  (load_info, first_stmt_info, true);
 	      gcc_assert (load_place != -1);
+	      if (load_place != j)
+		any_non_contiguous = true;
 	      load_permutation.safe_push (load_place);
 	    }
+	  if (any_non_contiguous
+	      && DR_GROUP_SIZE (first_stmt_info) == group_size)
+	    {
+	      vec<stmt_vec_info> alt_stmts;
+	      alt_stmts.create (group_size);
+	      for (stmt_vec_info cur = first_stmt_info; cur;
+		   cur = DR_GROUP_NEXT_ELEMENT (cur))
+		alt_stmts.quick_push (cur);
+	      if (alt_stmts.length () == group_size)
+		{
+		  load_permutation.release ();
+		  slp_tree load = vect_build_slp_tree (vinfo, alt_stmts,
+						       group_size,
+						       &this_max_nunits,
+						       matches, limit,
+						       tree_size, bst_map);
+		  if (!load)
+		    {
+		      alt_stmts.release ();
+		      matches[0] = false;
+		      return NULL;
+		    }
+		  lane_permutation_t lane_permute;
+		  lane_permute.create (group_size);
+		  FOR_EACH_VEC_ELT (stmts, j, load_info)
+		    {
+		      int load_place = vect_get_place_in_interleaving_chain
+					 (load_info, first_stmt_info, false);
+		      gcc_assert (load_place != -1);
+		      lane_permute.quick_push (std::make_pair (0, load_place));
+		    }
+		  node = vect_create_new_slp_node (node, stmts, 1);
+		  SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+		  SLP_TREE_LANE_PERMUTATION (node) = lane_permute;
+		  SLP_TREE_VECTYPE (node) = vectype;
+		  SLP_TREE_CHILDREN (node).quick_push (load);
+		  return node;
+		}
+	    }
+	  node = vect_create_new_slp_node (node, stmts, 0);
+	  SLP_TREE_VECTYPE (node) = vectype;
 	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
 	  return node;
 	}
@@ -3628,7 +3671,24 @@ vect_optimize_slp (vec_info *vinfo)
 	 as entries to the reverse graph.  */
       if (!slpg->vertices[idx].succ)
 	bitmap_set_bit (n_visited, idx);
-      /* Loads are the only thing generating permutes.  */
+
+      /* Simple permute generating loads have been split into
+         non-permuted loads and a separate lane permutation.  */
+      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
+	  && SLP_TREE_CHILDREN (node).length () == 1)
+	{
+	  gcc_assert (SLP_TREE_LANES (node)
+		      == SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[0]));
+	  vec<unsigned> perm = vNULL;
+	  perm.safe_grow (SLP_TREE_LANES (node), true);
+	  for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+	    perm[j] = SLP_TREE_LANE_PERMUTATION (node)[j].second;
+	  perms.safe_push (perm);
+	  n_perm[idx] = perms.length () - 1;
+	  continue;
+	}
+
+      /* Loads are the only remaining things generating permutes.  */
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
 	continue;
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index d95e359daae..2fb876dcfd7 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -8805,8 +8805,8 @@ vectorizable_load (vec_info *vinfo,
 	  if (grouped_load)
 	    cst_offset
 	      = (tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))
-		 * vect_get_place_in_interleaving_chain (stmt_info,
-							 first_stmt_info));
+		 * vect_get_place_in_interleaving_chain
+		     (stmt_info, first_stmt_info, true));
 	  group_size = 1;
 	  ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
 	}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index fa28336d429..82e86855ca1 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2033,7 +2033,8 @@ extern bool can_duplicate_and_interleave_p (vec_info *, unsigned int, tree,
 					    tree * = NULL, tree * = NULL);
 extern void duplicate_and_interleave (vec_info *, gimple_seq *, tree,
 				      vec<tree>, unsigned int, vec<tree> &);
-extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info);
+extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info,
+						 bool);
 extern bool vect_update_shared_vectype (stmt_vec_info, tree);
 extern slp_tree vect_create_new_slp_node (unsigned, tree_code);
 extern void vect_free_slp_tree (slp_tree);
Tamar Christina June 22, 2021, 1:33 p.m. UTC | #3
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Tuesday, June 22, 2021 1:08 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH]middle-end[RFC] slp: new implementation of complex
> numbers
> 
> On Mon, 21 Jun 2021, Tamar Christina wrote:
> 
> > Hi Richi,
> >
> > This patch is still very much incomplete and I do know that it is
> > missing things but it's complete enough such that examples are working
> > and allows me to show what I'm working towards.
> >
> > note, that this approach will remove a lot of code in
> > tree-vect-slp-patterns but to keep the diff readable I've left them in
> > and just commented out the calls or removed them where needed.
> >
> > The patch rewrites the complex numbers detection by splitting the
> > detection of structure from dataflow analysis.  In principle the
> > biggest difference between this and the previous implementation is
> > that instead of trying to detect valid complex operations it *makes*
> > an operation a valid complex operation.
> >
> > To do this each operation gets a dual optab which matches the same
> > structure but has no dataflow requirement.
> >
> > i.e. in this patch I added 4, ADDSUB, SUBADD, MUL_ADDSUB,
> MULL_SUBADD.
> >
> > There is a then a mapping between these and their variant with the
> dataflow:
> >
> > * ADDSUB -> COMPLEX_ADD_ROT270
> > * SUBADD -> COMPLEX_ADD_ROT90
> > * MUL_ADDSUB -> COMPLEX_MUL_CONJ
> > * MUL_SUBADD -> COMPLEX_MUL
> >
> > with the intention that when we detect the structure of an operation
> > we query the backend for both optabs.
> >
> > This should result in one of three states:
> >
> >  * not supported: Move on.
> >  * Supports ADDSUB only: Rewrite using ADDSUB, set type to
> 'cannot_transform'
> >  * Supports COMPLEX_ADD_ROT270 only: Rewrite using ADDSUB, set type
> to 'must_transform'
> >  * Supports both: Rewrite using ADDSUB, set type fo 'can_transform'
> >
> > with the idea behind `can_transform` is to check the costs of the
> > inverse permute needed to use the complex operation and if this is
> > very expensive then stick to addsub.  This requires the target to be
> > able to cost the operations reasonably correct.
> >
> > So for ADD this looks like
> >
> >  === vect_match_slp_patterns ===
> >  Analyzing SLP tree 0x494e970 for patterns  Found ADDSUB pattern in
> > SLP tree  Target does not support ADDSUB for vector type vector(4)
> > float  Found COMPLEX_ADD_ROT270 pattern in SLP tree  Target supports
> > COMPLEX_ADD_ROT270 vectorization with mode vector(4) float Pattern
> > matched SLP tree node 0x494e970 (max_nunits=4, refcnt=1) op template:
> > REALPART_EXPR <*_10> = _23;
> >   stmt 0 REALPART_EXPR <*_10> = _23;
> >   stmt 1 IMAGPART_EXPR <*_10> = _22;
> >   children 0x494ea00
> > node 0x494ea00 (max_nunits=4, refcnt=1) op template: slp_patt_39 =
> > .ADDSUB (_23, _23);
> >   stmt 0 _23 = _6 + _13;
> >   stmt 1 _22 = _12 - _8;
> >   children 0x494eb20 0x494ebb0
> > node 0x494eb20 (max_nunits=4, refcnt=1) op template: _13 =
> > REALPART_EXPR <*_3>;
> >   stmt 0 _13 = REALPART_EXPR <*_3>;
> >   stmt 1 _12 = IMAGPART_EXPR <*_3>;
> > node 0x494ebb0 (max_nunits=4, refcnt=1)
> > op: VEC_PERM_EXPR
> >   { }
> >   lane permutation { 0[1] 0[0] }
> >   children 0x494ec40
> > node 0x494ec40 (max_nunits=1, refcnt=2) op template: _8 =
> > REALPART_EXPR <*_5>;
> >   stmt 0 _8 = REALPART_EXPR <*_5>;
> >   stmt 1 _6 = IMAGPART_EXPR <*_5>;
> >   load permutation { 0 1 }
> >
> > and later during optimize_slp we get
> >
> > Tranforming SLP expression from ADDSUB to COMPLEX_ADD_ROT270
> > processing node 0x494ebb0 simplifying permute node 0x494ebb0
> Optimized
> > SLP instances:
> > node 0x494e970 (max_nunits=4, refcnt=1) op template: REALPART_EXPR
> > <*_10> = _23;
> >    stmt 0 REALPART_EXPR <*_10> = _23;
> >    stmt 1 IMAGPART_EXPR <*_10> = _22;
> >    children 0x494ea00
> > node 0x494ea00 (max_nunits=4, refcnt=1) op template: slp_patt_39 =
> > .COMPLEX_ADD_ROT270 (_23, _23);
> >    stmt 0 _23 = _6 + _13;
> >    stmt 1 _22 = _12 - _8;
> >    children 0x494eb20 0x494ebb0
> > node 0x494eb20 (max_nunits=4, refcnt=1) op template: _13 =
> > REALPART_EXPR <*_3>;
> >    stmt 0 _13 = REALPART_EXPR <*_3>;
> >    stmt 1 _12 = IMAGPART_EXPR <*_3>;
> > node 0x494ebb0 (max_nunits=4, refcnt=1)
> > op: VEC_PERM_EXPR
> >    { }
> >    lane permutation { 0[0] 0[1] }
> >    children 0x494ec40
> > node 0x494ec40 (max_nunits=1, refcnt=2) op template: _8 =
> > REALPART_EXPR <*_5>;
> >    stmt 0 _8 = REALPART_EXPR <*_5>;
> >    stmt 1 _6 = IMAGPART_EXPR <*_5>;
> >
> > Now I still have to elide the VEC_PERM_EXPR here but that's easy.
> 
> So having skimmed half of the patch - this means SLP pattern recog will
> initially recognize { +, -, +, - } as ADDSUB for example but not factor in lane
> permutes from loads yet.  Now, suppose we have { +, -, -, + } seen in pattern
> recog - how's that handled?

These will be rejected; the lanes are still checked to ensure that it's a { +, -, +, - }.
The lane permutes are checked in unchanged code, namely

vect_detect_pair_op (slp_tree node, bool two_operands = true,...)

which only calls ::matches when the operation is consistent.
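
To illustrate (a minimal scalar sketch, not taken from the patch itself), the two
shapes being distinguished look like:

/* Consistent { +, -, +, - } pair across the group: vect_detect_pair_op
   reports a PLUS_MINUS pair-op and the pattern's ::matches is queried.  */
void accepted (float *restrict c, float *restrict a, float *restrict b, int n)
{
  for (int i = 0; i < n; i += 2)
    {
      c[i]     = a[i]     + b[i];
      c[i + 1] = a[i + 1] - b[i + 1];
    }
}

/* Mixed { +, -, -, + }: no consistent two-operator pair across the group,
   so the pattern is rejected before ::matches is reached.  */
void rejected (float *restrict c, float *restrict a, float *restrict b, int n)
{
  for (int i = 0; i < n; i += 4)
    {
      c[i]     = a[i]     + b[i];
      c[i + 1] = a[i + 1] - b[i + 1];
      c[i + 2] = a[i + 2] - b[i + 2];
      c[i + 3] = a[i + 3] + b[i + 3];
    }
}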

> It might feed operations where we'd like to have inputs permuted and thus
> would end up with ADDSUB in the end?

In principle we could allow this; if we were to do so, one way to handle
it would be to add a permute for ADDSUB as well to permute the lanes.

I think it's better in this scenario to have the pattern recog code insert the
permute early on, in the ::build phase.  This maintains correctness in the SLP
tree and the extra permute is then dealt with by optimize_slp's normal
processing.

It becomes a bit tricky here though, since such cases are perhaps better handled
by increasing the VF and using permuted loads (LOAD_LANES).

> 
> That said, you do
> 
> +         /* Check to see if this node can be transformed during permute
> +            materialization.  */
> +         bool patt_trans_cand_p = is_pattern_convert_candidate_p (node);
> +         if (patt_trans_cand_p)
> +           bitmap_set_bit (n_transform, idx);
> 
> in the propagation stage (but this looks like a static marking).

Yeah indeed, it's just a way to say "inspect these later".

> 
> And then just for each of those candidates you somehow "undo" the
> permutes on the SLP childs (by modifying it in-place - uh, see below).
> But if you figure one child that is not permuted you've done that anyway but
> stop.

Yeah, if the child is unpermuted, vect_reverse_perm returns the identity permute,
which is later elided.  In the final version this should just not set a permute at all.
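
As a standalone sketch (plain arrays instead of the load_permutation_t used in
the patch, so only illustrative), inverting a bijective permute and the identity
case look like:

#include <stdio.h>

/* Simplified sketch, not the patch code: invert a bijective lane
   permutation.  rev[perm[i]] = i, so applying rev after perm restores
   the original lane order.  */
static void
reverse_perm (const unsigned *perm, unsigned *rev, unsigned n)
{
  for (unsigned i = 0; i < n; i++)
    rev[perm[i]] = i;
}

int
main (void)
{
  unsigned swap[2] = { 1, 0 }, ident[2] = { 0, 1 }, rev[2];

  reverse_perm (swap, rev, 2);
  printf ("{ %u, %u }\n", rev[0], rev[1]);  /* { 1, 0 }: its own inverse.  */

  reverse_perm (ident, rev, 2);
  printf ("{ %u, %u }\n", rev[0], rev[1]);  /* { 0, 1 }: identity, elided.  */
  return 0;
}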

> 
> So what I've expected is that this "transform" is done during propagation
> (where you now set the n_transform bit), by means of detecting the
> candidate as materialization point (but I'd have to recap the kind of permutes
> we expect to be incoming).  That is, for a specific set of permutes on the
> successor edges claim the node can handle the permute and thus propagate
> unpermuted upwards.
> 
> At the materialization stage you'd then transform the node.

Yeah, that makes sense. I separated them out to have distinct "stages", but yeah
the loops can be combined. I'll do that.

> 
> Btw, how do we handle targets that can do complex_addsub but not addsub?
> 

That's why during pattern matching the optab code checks both.  ADDSUB is temporarily
accepted and then the transform code MUST transform it.  So ADDSUB cannot leave
optimize_slp if the operation is not supported.

This is what I meant by the three states (missing from this patch):

 * not supported: Move on.
 * Supports ADDSUB only: Rewrite using ADDSUB, set type to 'cannot_transform'
 * Supports COMPLEX_ADD_ROT270 only: Rewrite using ADDSUB, set type to 'must_transform'
 * Supports both: Rewrite using ADDSUB, set type to 'can_transform'
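
As an illustration only (these states are not in the posted patch and the names
are placeholders), the outcome of querying both optabs could be recorded per
pattern node as something like:

/* Hypothetical sketch -- not part of the posted patch.  */
enum pattern_transform_kind
{
  /* Only the ADDSUB-style optab is supported: keep ADDSUB, never rewrite.  */
  PATTERN_CANNOT_TRANSFORM,
  /* Only the COMPLEX_* optab is supported: optimize_slp must rewrite the
     temporary ADDSUB regardless of the permute cost.  */
  PATTERN_MUST_TRANSFORM,
  /* Both are supported: compare the cost of the inverse permute against
     keeping ADDSUB and decide during optimize_slp.  */
  PATTERN_CAN_TRANSFORM
};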

> > This works for ADD, but it doesn't work well when there's a
> > complicated sequence of loads.  So for example
> >
> > #define N 200
> > void g (double complex a[restrict N], double complex b[restrict N],
> > 	double complex c[restrict N])
> > {
> >   for (int i=0; i < N; i++)
> >     {
> >       c[i] =  a[i] * b[i];
> >     }
> > }
> >
> > will results in an SLP tree where each operand of the multiply does
> > not get to see all the original vectors:
> >
> > Final SLP tree for instance 0x5678a30:
> > node 0x55965a0 (max_nunits=4, refcnt=2) op template: REALPART_EXPR
> > <*_7> = _25;
> >   stmt 0 REALPART_EXPR <*_7> = _25;
> >   stmt 1 IMAGPART_EXPR <*_7> = _26;
> >   children 0x5596630
> > node 0x5596630 (max_nunits=4, refcnt=2)
> > op: VEC_PERM_EXPR
> >   stmt 0 _25 = _17 - _22;
> >   stmt 1 _26 = _23 + _24;
> >   lane permutation { 0[0] 1[1] }
> >   children 0x5596a20 0x5596ab0
> > node 0x5596a20 (max_nunits=1, refcnt=1) op template: _25 = _17 - _22;
> >   { }
> >   children 0x55966c0 0x5596870
> > node 0x55966c0 (max_nunits=4, refcnt=3) op template: _17 = _10 * _19;
> >   stmt 0 _17 = _10 * _19;
> >   stmt 1 _23 = _10 * _18;
> >   children 0x5596750 0x55967e0
> > node 0x5596750 (max_nunits=4, refcnt=2) op template: _10 =
> > REALPART_EXPR <*_3>;
> >   stmt 0 _10 = REALPART_EXPR <*_3>;
> >   stmt 1 _10 = REALPART_EXPR <*_3>;
> >   load permutation { 0 0 }
> > node 0x55967e0 (max_nunits=4, refcnt=2) op template: _19 =
> > REALPART_EXPR <*_5>;
> >   stmt 0 _19 = REALPART_EXPR <*_5>;
> >   stmt 1 _18 = IMAGPART_EXPR <*_5>;
> >   load permutation { 0 1 }
> > node 0x5596870 (max_nunits=4, refcnt=3) op template: _22 = _9 * _18;
> >   stmt 0 _22 = _9 * _18;
> >   stmt 1 _24 = _9 * _19;
> >   children 0x5596900 0x5596990
> > node 0x5596900 (max_nunits=4, refcnt=2) op template: _9 =
> > IMAGPART_EXPR <*_3>;
> >   stmt 0 _9 = IMAGPART_EXPR <*_3>;
> >   stmt 1 _9 = IMAGPART_EXPR <*_3>;
> >   load permutation { 1 1 }
> > node 0x5596990 (max_nunits=4, refcnt=2) op template: _18 =
> > IMAGPART_EXPR <*_5>;
> >   stmt 0 _18 = IMAGPART_EXPR <*_5>;
> >   stmt 1 _19 = REALPART_EXPR <*_5>;
> >   load permutation { 1 0 }
> > node 0x5596ab0 (max_nunits=1, refcnt=1) op template: _26 = _23 + _24;
> >   { }
> >   children 0x55966c0 0x5596870
> >
> > So depending on the node each multiply either sees REAL 3 + REAL 5 +
> > IMAG 5 or IMAG 3 + IMAG 5 + REAL 5.
> >
> > since we are removing the TWO_OPERANDS node we need to drop one of
> the
> > multiply and so we need to give it the original 2 vectors as a
> > parameter.  The current implementation emits a permute operation to
> > combine the loads again and later pushes the permute down.  This is
> > problematic as it again requires us to do df analysis early.
> >
> > To counter this, in the patch I have changed loads to no longer come
> > out of build_slp with LOAD_PERMUTES but instead to have a VEC_PERM
> above each load.
> 
> Yep.  I've been there before (not sure if I ever sent you my work-in-progress
> here).  There's some wrongs in your patch but I guess doing this exactly for
> the permutes optimize_slp handles would be fine.

Yeah, it fell apart when testing the x264 code above, and I noticed I had some of
the lanes wrong, but didn't fix it yet as it wasn't needed for my testcases.

> 
> We should see to do this independently of the stuff above, I can handle this
> and will prepare a patch for this later.
> 

Thanks! That's a big help!

> > This is only temporary and the idea is that optimize SLP will push
> > them back down into the loads once it's deemed there are still permutes
> needed.
> >
> > So COMPLEX MUL becomes:
> >
> > Final SLP tree for instance 0x46d9c80:
> > node 0x45f76c0 (max_nunits=4, refcnt=2) op template: REALPART_EXPR
> > <*_7> = _25;
> >   stmt 0 REALPART_EXPR <*_7> = _25;
> >   stmt 1 IMAGPART_EXPR <*_7> = _26;
> >   children 0x45f7750
> > node 0x45f7750 (max_nunits=4, refcnt=2)
> > op: VEC_PERM_EXPR
> >   stmt 0 _25 = _17 - _22;
> >   stmt 1 _26 = _23 + _24;
> >   lane permutation { 0[0] 1[1] }
> >   children 0x45f7bd0 0x45f7c60
> > node 0x45f7bd0 (max_nunits=1, refcnt=1) op template: _25 = _17 - _22;
> >   { }
> >   children 0x45f77e0 0x45f7a20
> > node 0x45f77e0 (max_nunits=4, refcnt=3) op template: _17 = _10 * _19;
> >   stmt 0 _17 = _10 * _19;
> >   stmt 1 _23 = _10 * _18;
> >   children 0x45f7870 0x45f7990
> > node 0x45f7870 (max_nunits=4, refcnt=2)
> > op: VEC_PERM_EXPR
> >   { }
> >   lane permutation { 0[0] 0[0] }
> >   children 0x45f7900
> > node 0x45f7900 (max_nunits=1, refcnt=4) op template: _10 =
> > REALPART_EXPR <*_3>;
> >   stmt 0 _10 = REALPART_EXPR <*_3>;
> >   stmt 1 _9 = IMAGPART_EXPR <*_3>;
> >   load permutation { 0 1 }
> > node 0x45f7990 (max_nunits=4, refcnt=3) op template: _19 =
> > REALPART_EXPR <*_5>;
> >   stmt 0 _19 = REALPART_EXPR <*_5>;
> >   stmt 1 _18 = IMAGPART_EXPR <*_5>;
> >   load permutation { 0 1 }
> > node 0x45f7a20 (max_nunits=4, refcnt=3) op template: _22 = _9 * _18;
> >   stmt 0 _22 = _9 * _18;
> >   stmt 1 _24 = _9 * _19;
> >   children 0x45f7ab0 0x45f7b40
> > node 0x45f7ab0 (max_nunits=4, refcnt=2)
> > op: VEC_PERM_EXPR
> >   { }
> >   lane permutation { 0[1] 0[1] }
> >   children 0x45f7900
> > node 0x45f7b40 (max_nunits=4, refcnt=2)
> > op: VEC_PERM_EXPR
> >   { }
> >   lane permutation { 0[1] 0[0] }
> >   children 0x45f7990
> > node 0x45f7c60 (max_nunits=1, refcnt=1) op template: _26 = _23 + _24;
> >   { }
> >   children 0x45f77e0 0x45f7a20
> >
> > This representation has a number of advantages:
> >
> >  * complex mul becomes trivial, as trivial as add.
> >  * add now detects itself as part of a sequence of complex mul.  i.e. if you
> >    have only add, it will use it instead of failing to follow the df like now.
> >  * It opens the door a bit more to vectorize the first loop in x264.  The code
> >    there has a cross iteration mix of loading the full vector but in an order
> >    that the vectorizer thinks it can't use a single linear load for.  e.g.
> >
> >    .. = (a[0] + b[0]) - (a[4] + b[4])
> >    .. = (a[1] + b[1]) - (a[5] + b[5])
> >    .. = (a[2] + b[2]) - (a[6] + b[6])
> >    .. = (a[3] + b[3]) - (a[7] + b[6])
> >
> >  Ultimately it thinks it's cheaper to use scalar loads here, while (at
> > least for
> >  aarch64) if we see the unrolled loop together we can pattern match
> > this sequence  and rewrite it to something that uses the linear load.
> > This is because the  addition and subtract are widening.  So the
> > widening arithmetic operations we  have already only read half the vector
> since they're widening.
> >
> >  For this to work, we need to see all 16 bytes loads, so the full
> > unrolled loop  of 4 iterations. Which currently doesn't happen for
> > NEON but does for SVE. (though  I wonder if now that you've added SLP
> > decomposition if the initial SLP build  shouldn't allow building of
> > SLP trees that are wider than the vector length and  later just
> > decompose it or bump up the VF.).  However this is something for  another
> discussion :).
> >
> >  Changing the loads like this makes things a lot simpler and we
> > require no  additional work anywhere else.  The complexity ends up in
> > optimize SLP which  now has the following changes:
> >
> >  * loads no longer introduce permutes, so they don't see the permutations.
> >    Instead each load can lead to a permute which can see the permutes.
> >  * As we're moving permutes up we stop at call boundaries as well.
> >  * As we're calculating materialization points for the permute we inspect
> the
> >    node and check if we can transform it, if so we mark it for transformation.
> >  * After calculating the materialization and transform points we iterate over
> >    the transform points checking whether we should transform.  If we
> should we
> >    modify the permute at the materialization point (which should be at the
> >    transform point) into the inverse permute.
> >  * During materialization as the permute is transformed this would undo
> the
> >    permutation if we transformed the instruction.
> >  * After computing this we elide the permute node, or push it down
> towards the
> >    load and then back into the load itself, splitting the node if required.
> >
> > After this the tree should be back to it's normal form.  I still need
> > to work out when and where one needs to push down the permute all the
> > way to the loads.
> 
> It's indeed the case that the current permute "propagation" is away from
> loads only and a phase to propagate in the reverse direction is missing if we
> now have permute sources other than loads.  The cycle handling of the
> propagation is also a bit awkward and incomplete (but it "works").
> 

The thing I'm also wondering about is whether it's always better to push it back
in; e.g. when a given load feeds several differently permuted variants of the
result, it might be beneficial to keep the permutes and do the load only once
instead.

Thanks,
Tamar

> Richard.
> 
> > But this covers my thoughts.  Any feedback is appreciated to get it
> > right early on :)
> >
> > Regards,
> > Tamar
> >
> > --- inline copy of patch --
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index
> >
> b2f414d2131b867eda337cd30f5ed40ed7c9fa10..73d1d742414d16e06e8312967
> 990
> > d565eddb218d 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -277,6 +277,10 @@ DEF_INTERNAL_FLT_FN (SCALB, ECF_CONST, scalb,
> > binary)  DEF_INTERNAL_FLT_FLOATN_FN (FMIN, ECF_CONST, fmin, binary)
> > DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
> > DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
> > +DEF_INTERNAL_OPTAB_FN (ADDSUB, ECF_CONST, addsub, binary)
> > +DEF_INTERNAL_OPTAB_FN (SUBADD, ECF_CONST, subadd, binary)
> > +DEF_INTERNAL_OPTAB_FN (MUL_ADDSUB, ECF_CONST, mul_addsub,
> binary)
> > +DEF_INTERNAL_OPTAB_FN (MUL_SUBADD, ECF_CONST, mul_subadd,
> binary)
> >  DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90,
> binary)
> > DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST,
> cadd270, binary)
> > DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
> diff
> > --git a/gcc/optabs.def b/gcc/optabs.def index
> >
> b192a9d070b8aa72e5676b2eaa020b5bdd7ffcc8..73d23c49bdfa7423f67a62406
> 50c
> > 558cfaf3f2f0 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -290,6 +290,10 @@ OPTAB_D (atan_optab, "atan$a2")  OPTAB_D
> > (atanh_optab, "atanh$a2")  OPTAB_D (copysign_optab, "copysign$F$a3")
> > OPTAB_D (xorsign_optab, "xorsign$F$a3")
> > +OPTAB_D (addsub_optab, "addsub$a3")
> > +OPTAB_D (subadd_optab, "subadd$a3")
> > +OPTAB_D (mul_addsub_optab, "mul_addsub$a3") OPTAB_D
> > +(mul_subadd_optab, "mul_subadd$a3")
> >  OPTAB_D (cadd90_optab, "cadd90$a3")
> >  OPTAB_D (cadd270_optab, "cadd270$a3")  OPTAB_D (cmul_optab,
> > "cmul$a3") diff --git a/gcc/tree-vect-slp-patterns.c
> > b/gcc/tree-vect-slp-patterns.c index
> >
> 2ed49cd9edcabd7948b365dd60d7405b79079a7b..ce8d2daa1f06f17eda2cb082
> 55d7
> > 6bac9cf263f0 100644
> > --- a/gcc/tree-vect-slp-patterns.c
> > +++ b/gcc/tree-vect-slp-patterns.c
> > @@ -68,6 +68,39 @@ along with GCC; see the file COPYING3.  If not see
> >   * vect_pattern class
> >
> >
> **********************************************************
> ************
> > ********/
> >
> > +static bool
> > +vect_pattern_validate_optab (internal_fn ifn, slp_tree node);
> > +
> > +
> > +/* Check if the base or complex equivalents are supported.  */ static
> > +bool vect_pattern_validate_optab_or_eq (internal_fn ifn, slp_tree
> > +node) {
> > +  if (vect_pattern_validate_optab (ifn, node))
> > +    return true;
> > +
> > +  switch (ifn)
> > +  {
> > +    case IFN_ADDSUB:
> > +      ifn = IFN_COMPLEX_ADD_ROT270;
> > +      break;
> > +    case IFN_SUBADD:
> > +      ifn = IFN_COMPLEX_ADD_ROT90;
> > +      break;
> > +    case IFN_MUL_ADDSUB:
> > +      ifn = IFN_COMPLEX_MUL_CONJ;
> > +      break;
> > +    case IFN_MUL_SUBADD:
> > +      ifn = IFN_COMPLEX_MUL;
> > +      break;
> > +    default:
> > +      gcc_unreachable ();
> > +  }
> > +
> > +  return vect_pattern_validate_optab (ifn, node); }
> > +
> > +
> >  /* Default implementation of recognize that performs matching, validation
> and
> >     replacement of nodes but that can be overriden if required.  */
> >
> > @@ -630,8 +663,7 @@ complex_add_pattern::build (vec_info *vinfo)
> >
> >    /* First re-arrange the children.  */
> >    SLP_TREE_CHILDREN (*this->m_node)[0] = children[0];
> > -  SLP_TREE_CHILDREN (*this->m_node)[1] =
> > -    vect_build_swap_evenodd_node (children[1]);
> > +  SLP_TREE_CHILDREN (*this->m_node)[1] = children[1];
> >
> >    SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (*this->m_node)[0])++;
> >    SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (*this->m_node)[1])++;
> @@
> > -670,9 +702,9 @@ complex_add_pattern::matches (complex_operation_t
> op,
> >        Rotation 0 and 180 can be handled by normal SIMD code, so we don't
> need
> >        to care about them here.  */
> >    if (op == MINUS_PLUS)
> > -    ifn = IFN_COMPLEX_ADD_ROT90;
> > +    ifn = IFN_SUBADD;
> >    else if (op == PLUS_MINUS)
> > -    ifn = IFN_COMPLEX_ADD_ROT270;
> > +    ifn = IFN_ADDSUB;
> >    else
> >      return ifn;
> >
> > @@ -680,17 +712,7 @@ complex_add_pattern::matches
> (complex_operation_t op,
> >       we support.  */
> >    gcc_assert (ops->length () == 2);
> >
> > -  vec<slp_tree> children = SLP_TREE_CHILDREN ((*ops)[0]);
> > -
> > -  /* First node must be unpermuted.  */
> > -  if (linear_loads_p (perm_cache, children[0]) != PERM_EVENODD)
> > -    return IFN_LAST;
> > -
> > -  /* Second node must be permuted.  */
> > -  if (linear_loads_p (perm_cache, children[1]) != PERM_ODDEVEN)
> > -    return IFN_LAST;
> > -
> > -  if (!vect_pattern_validate_optab (ifn, *node))
> > +  if (!vect_pattern_validate_optab_or_eq (ifn, *node))
> >      return IFN_LAST;
> >
> >    return ifn;
> > @@ -1501,7 +1523,8 @@ vect_pattern_decl_t slp_patterns[]
> >       order patterns from the largest to the smallest.  Especially if they
> >       overlap in what they can detect.  */
> >
> > -  SLP_PATTERN (complex_operations_pattern),
> > +  SLP_PATTERN (complex_add_pattern),
> > +  //SLP_PATTERN (complex_operations_pattern),
> >  };
> >  #undef SLP_PATTERN
> >
> > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index
> >
> 0c1f85beeb2e9f3fb7c66c15d4d30594b2570f9e..f24ad85d15780807c49153895b
> 73
> > 40c9b4d4d1f3 100644
> > --- a/gcc/tree-vect-slp.c
> > +++ b/gcc/tree-vect-slp.c
> > @@ -1764,25 +1764,69 @@ vect_build_slp_tree_2 (vec_info *vinfo,
> slp_tree node,
> >  	{
> >  	  *max_nunits = this_max_nunits;
> >  	  (*tree_size)++;
> > -	  node = vect_create_new_slp_node (node, stmts, 0);
> > -	  SLP_TREE_VECTYPE (node) = vectype;
> > +	  node = vect_create_new_slp_node (node, stmts, 1);
> >  	  /* And compute the load permutation.  Whether it is actually
> >  	     a permutation depends on the unrolling factor which is
> > -	     decided later.  */
> > -	  vec<unsigned> load_permutation;
> > +	     decided later.  We specifically materialize permutes at this
> > +	     early stage and leave it up to optimize SLP to push them back
> > +	     into the load if needed.  */
> > +	  lane_permutation_t lane_permutation;
> > +
> > +	  /* See the loads with a linear permute, this so that optimize_slp
> > +	     can later push the permute upwards and downwards without
> needing
> > +	     to inspect the parent node.  */
> > +	  load_permutation_t load_permutation;
> >  	  int j;
> > -	  stmt_vec_info load_info;
> > +	  stmt_vec_info load_info, curr_load;
> > +	  lane_permutation.create (group_size);
> >  	  load_permutation.create (group_size);
> > +	  vec<stmt_vec_info> group_stmts;
> > +	  bool linear = true;
> >  	  stmt_vec_info first_stmt_info
> >  	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS
> (node)[0]);
> > +	  group_stmts.create (DR_GROUP_SIZE (first_stmt_info));
> > +	  curr_load = first_stmt_info;
> >  	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j,
> load_info)
> >  	    {
> >  	      int load_place = vect_get_place_in_interleaving_chain
> >  		  (load_info, first_stmt_info);
> >  	      gcc_assert (load_place != -1);
> > -	      load_permutation.safe_push (load_place);
> > +
> > +	      lane_permutation.safe_push (std::make_pair (0, load_place));
> > +	      load_permutation.quick_push (j);
> > +
> > +	      group_stmts.quick_push (curr_load);
> > +	      linear = linear && curr_load == load_info;
> > +	      curr_load = DR_GROUP_NEXT_ELEMENT (curr_load);
> > +	    }
> > +
> > +	  if (!linear)
> > +	    {
> > +	      slp_tree child;
> > +	      if (slp_tree *load = bst_map->get (group_stmts))
> > +		child = *load;
> > +	      else
> > +		{
> > +		  child =  vect_create_new_slp_node (group_stmts.copy (),
> 0);
> > +		  SLP_TREE_VECTYPE (child) = vectype;
> > +		  /* One for me, one for the BST map.  */
> > +		  SLP_TREE_REF_COUNT (child)++;
> > +		  bst_map->put (group_stmts, child);
> > +		}
> > +
> > +	      /* Set some metadata on the load node.  */
> > +	      SLP_TREE_REF_COUNT (child)++;
> > +	      SLP_TREE_LANES (child) = SLP_TREE_LANES (node);
> > +	      SLP_TREE_LOAD_PERMUTATION (child) = load_permutation;
> > +
> > +	      /* But return a permute node.  */
> > +	      SLP_TREE_LANE_PERMUTATION (node) = lane_permutation;
> > +	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
> > +	      SLP_TREE_CHILDREN (node).quick_push (child);
> > +	      SLP_TREE_SCALAR_STMTS (node) = vNULL;
> >  	    }
> > -	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
> > +
> > +	  SLP_TREE_VECTYPE (node) = vectype;
> >  	  return node;
> >  	}
> >      }
> > @@ -3430,8 +3474,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned
> max_tree_size)
> >        FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
> >  	{
> >  	  slp_tree root = SLP_INSTANCE_TREE (instance);
> > -	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES
> (root),
> > -					&load_map, root);
> > +	  //optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES
> (root),
> > +	//				&load_map, root);
> >  	}
> >      }
> >
> > @@ -3548,6 +3592,83 @@ vect_slp_perms_eq (const vec<vec<unsigned> >
> &perms,
> >  			 sizeof (unsigned) * perms[perm_a].length ()) ==
> 0));  }
> >
> > +static bool
> > +is_pattern_convert_candidate_p (slp_tree node) {
> > +  stmt_vec_info stmt_info;
> > +  if (!(stmt_info = SLP_TREE_REPRESENTATIVE (node))
> > +      || !STMT_VINFO_SLP_VECT_ONLY_PATTERN (stmt_info))
> > +    return false;
> > +
> > +  gimple* stmt = STMT_VINFO_STMT (stmt_info);  if (!is_gimple_call
> > + (stmt))
> > +    return false;
> > +
> > +  if (!gimple_call_internal_p (stmt))
> > +    return false;
> > +
> > +  internal_fn ifn = gimple_call_internal_fn (stmt);
> > +
> > +  switch (ifn)
> > +    {
> > +    case IFN_ADDSUB:
> > +    case IFN_SUBADD:
> > +    case IFN_MUL_ADDSUB:
> > +    case IFN_MUL_SUBADD:
> > +      return true;
> > +    default:
> > +      break;
> > +    }
> > +
> > +  return false;
> > +}
> > +
> > +static internal_fn
> > +vect_get_transformed_pattern (internal_fn ifn, slp_tree /* node */) {
> > +  switch (ifn)
> > +    {
> > +    case IFN_ADDSUB:
> > +      ifn = IFN_COMPLEX_ADD_ROT270;
> > +      break;
> > +    case IFN_SUBADD:
> > +      ifn = IFN_COMPLEX_ADD_ROT90;
> > +      break;
> > +    case IFN_MUL_ADDSUB:
> > +      ifn = IFN_COMPLEX_MUL_CONJ;
> > +      break;
> > +    case IFN_MUL_SUBADD:
> > +      ifn = IFN_COMPLEX_MUL;
> > +      break;
> > +    default:
> > +      gcc_unreachable ();
> > +    }
> > +
> > +  return ifn;
> > +}
> > +
> > +
> > +static load_permutation_t
> > +vect_reverse_perm (load_permutation_t perm) {
> > +  auto_vec<std::pair<unsigned, unsigned> > pairs;
> > +  pairs.create (perm.length ());
> > +  unsigned i;
> > +  for (i = 0; i < perm.length (); i++)
> > +    pairs.quick_push (std::make_pair (i, perm[i]));
> > +
> > +  typedef const std::pair<unsigned, unsigned>* cmp_t;  pairs.qsort
> > + ([](const void* a, const void* b) -> int
> > +    { return (int)((cmp_t)a)->second - (int)((cmp_t)b)->second; });
> > +
> > +  load_permutation_t rev_perm;
> > +  rev_perm.create (perm.length ());
> > +  for (i = 0; i < perm.length (); i++)
> > +    rev_perm.quick_push (pairs[i].first);
> > +
> > +  return rev_perm;
> > +}
> > +
> >  /* Optimize the SLP graph of VINFO.  */
> >
> >  void
> > @@ -3578,11 +3699,13 @@ vect_optimize_slp (vec_info *vinfo)
> >
> >    auto_sbitmap n_visited (vertices.length ());
> >    auto_sbitmap n_materialize (vertices.length ());
> > +  auto_sbitmap n_transform (vertices.length ());
> >    auto_vec<int> n_perm (vertices.length ());
> >    auto_vec<vec<unsigned> > perms;
> >
> >    bitmap_clear (n_visited);
> >    bitmap_clear (n_materialize);
> > +  bitmap_clear (n_transform);
> >    n_perm.quick_grow_cleared (vertices.length ());
> >    perms.safe_push (vNULL); /* zero is no permute */
> >
> > @@ -3602,55 +3725,73 @@ vect_optimize_slp (vec_info *vinfo)
> >  	 as entries to the reverse graph.  */
> >        if (!slpg->vertices[idx].succ)
> >  	bitmap_set_bit (n_visited, idx);
> > -      /* Loads are the only thing generating permutes.  */
> > -      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> > -	continue;
> >
> > -      /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
> > -	 node unpermuted, record this permute.  */
> > -      stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
> > -      if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
> > -	continue;
> > -      dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
> > -      unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
> > -      bool any_permute = false;
> > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > +      int perm_seed = 0;
> > +
> > +      for (graph_edge *pred = slpg->vertices[idx].pred;
> > +	   pred; pred = pred->pred_next)
> >  	{
> > -	  unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
> > -	  imin = MIN (imin, idx);
> > -	  imax = MAX (imax, idx);
> > -	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
> > -	    any_permute = true;
> > -	}
> > -      /* If there's no permute no need to split one out.  */
> > -      if (!any_permute)
> > -	continue;
> > -      /* If the span doesn't match we'd disrupt VF computation, avoid
> > -	 that for now.  */
> > -      if (imax - imin + 1 != SLP_TREE_LANES (node))
> > -	continue;
> > +	  int src_idx = pred->src;
> > +	  slp_tree src_node = vertices[src_idx];
> >
> > -      /* For now only handle true permutes, like
> > -	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > -	 when permuting constants and invariants keeping the permute
> > -	 bijective.  */
> > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > -      bitmap_clear (load_index);
> > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > -	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION
> (node)[j] - imin);
> > -      unsigned j;
> > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > -	if (!bitmap_bit_p (load_index, j))
> > -	  break;
> > -      if (j != SLP_TREE_LANES (node))
> > -	continue;
> > +	  /* Loads are the only thing generating permutes, however the
> permutes
> > +	     are not yet lowered into the loads.  So instead chase up the chain
> > +	     to find a permute node.  */
> > +	  if (!SLP_TREE_LANE_PERMUTATION (src_node).exists ())
> > +	    continue;
> > +
> > +	  /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
> > +	     node unpermuted, record this permute.  */
> > +	  stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
> > +	  if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
> > +	    continue;
> > +	  dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
> > +	  unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
> > +	  bool any_permute = false;
> > +	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
> > +	    {
> > +	      unsigned src_op = SLP_TREE_LANE_PERMUTATION
> (src_node)[j].first;
> > +	      unsigned idx = SLP_TREE_LANE_PERMUTATION
> (src_node)[j].second;
> > +	      /* This isn't a load permute moved out.  */
> > +	      if (src_op != 0)
> > +		continue;
> > +	      imin = MIN (imin, idx);
> > +	      imax = MAX (imax, idx);
> > +	      if (idx - SLP_TREE_LANE_PERMUTATION (src_node)[0].second != j)
> > +		any_permute = true;
> > +	    }
> > +	  /* If there's no permute no need to split one out.  */
> > +	  if (!any_permute)
> > +	    continue;
> > +	  /* If the span doesn't match we'd disrupt VF computation, avoid
> > +	     that for now.  */
> > +	  if (imax - imin + 1 != SLP_TREE_LANES (node))
> > +	    continue;
> > +
> > +	  /* For now only handle true permutes, like
> > +	     vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > +	     when permuting constants and invariants keeping the permute
> > +	     bijective.  */
> > +	  auto_sbitmap load_index (SLP_TREE_LANES (src_node));
> > +	  bitmap_clear (load_index);
> > +	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
> > +	    bitmap_set_bit (load_index, SLP_TREE_LANE_PERMUTATION
> (src_node)[j].second - imin);
> > +	  unsigned j;
> > +	  for (j = 0; j < SLP_TREE_LANES (src_node); ++j)
> > +	  if (!bitmap_bit_p (load_index, j))
> > +	    break;
> > +	  if (j != SLP_TREE_LANES (src_node))
> > +	    continue;
> >
> > -      vec<unsigned> perm = vNULL;
> > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > -	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > -      perms.safe_push (perm);
> > -      n_perm[idx] = perms.length () - 1;
> > +	  vec<unsigned> perm = vNULL;
> > +	  perm.safe_grow (SLP_TREE_LANES (src_node), true);
> > +	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
> > +	    perm[j] = SLP_TREE_LANE_PERMUTATION (src_node)[j].second -
> imin;
> > +	  perms.safe_push (perm);
> > +	  n_perm[src_idx] = perms.length () - 1;
> > +	  perm_seed = -1;
> > +       }
> > +      n_perm[idx] = perm_seed;
> >      }
> >
> >    /* Propagate permutes along the graph and compute materialization
> > points.  */ @@ -3667,12 +3808,12 @@ vect_optimize_slp (vec_info *vinfo)
> >  	  slp_tree node = vertices[idx];
> >  	  /* For leafs there's nothing to do - we've seeded permutes
> >  	     on those above.  */
> > -	  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
> > +	  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def ||
> > +!slpg->vertices[idx].succ)
> >  	    continue;
> >
> >  	  bitmap_set_bit (n_visited, idx);
> >
> > -	  /* We cannot move a permute across a store.  */
> > +	  /* We cannot move a permute across a store.  TODO: or a call.  */
> >  	  if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
> >  	      && DR_IS_WRITE
> >  		   (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE
> (node)))) @@
> > -3723,6 +3864,12 @@ vect_optimize_slp (vec_info *vinfo)
> >  	      changed = true;
> >  	    }
> >
> > +	  /* Check to see if this node can be transformed during permute
> > +	     materialization.  */
> > +	  bool patt_trans_cand_p = is_pattern_convert_candidate_p (node);
> > +	  if (patt_trans_cand_p)
> > +	    bitmap_set_bit (n_transform, idx);
> > +
> >  	  if (perm == 0)
> >  	    continue;
> >
> > @@ -3764,6 +3911,46 @@ vect_optimize_slp (vec_info *vinfo)
> >      }
> >    while (changed || iteration == 1);
> >
> > +  bitmap_clear (n_visited);
> > +
> > +  /* Transform and record any permutes for the usage of the instruction.
> > +     TODO: Check if it's cheaper to keep ADDSUB if available or use to
> > +	   COMPLEX_ADD based on the reverse permute that's needed.  */
> > +  sbitmap_iterator bi;
> > +  EXECUTE_IF_SET_IN_BITMAP (n_transform, 0, i, bi)
> > +    {
> > +      slp_tree node = vertices[i];
> > +
> > +      for (graph_edge *succ = slpg->vertices[i].succ;
> > +		       succ; succ = succ->succ_next)
> > +	{
> > +	  int perm_id = n_perm[succ->dest];
> > +	  if (perm_id <= 0)
> > +	    continue;
> > +
> > +	  load_permutation_t linperm = vect_reverse_perm
> (perms[perm_id]);
> > +	  perms[perm_id].release ();
> > +	  perms[perm_id] = linperm;
> > +
> > +          bitmap_set_bit (n_materialize, succ->dest);
> > +	}
> > +
> > +      /* Transform the node if we have determined that it's beneficial to do
> so.
> > +	 when we do so the perm that has been calculated for the children of
> the
> > +	 node will be applied.  */
> > +      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
> > +      gimple* stmt = STMT_VINFO_STMT (stmt_info);
> > +      internal_fn old_ifn = gimple_call_internal_fn (stmt);
> > +      internal_fn ifn = vect_get_transformed_pattern (old_ifn ,node);
> > +
> > +      if (dump_enabled_p ())
> > +	dump_printf_loc (MSG_NOTE, vect_location,
> > +			 "Tranforming SLP expression from %s to %s\n",
> > +			 internal_fn_name (old_ifn), internal_fn_name (ifn));
> > +      gimple_call_set_internal_fn (as_a <gcall *> (stmt), ifn);
> > +    }
> > +
> > +
> >    /* Materialize.  */
> >    for (i = 0; i < vertices.length (); ++i)
> >      {
> > @@ -3798,6 +3985,11 @@ vect_optimize_slp (vec_info *vinfo)
> >  			    SLP_TREE_SCALAR_OPS (child), true);
> >  	}
> >
> > +      if (dump_enabled_p ())
> > +	dump_printf_loc (MSG_NOTE, vect_location,
> > +			 "processing node %p\n",
> > +			 node);
> > +
> >        if (bitmap_bit_p (n_materialize, i))
> >  	{
> >  	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) @@ -3986,6
> > +4178,16 @@ vect_optimize_slp (vec_info *vinfo)
> >  	    }
> >  	}
> >      }
> > +
> > +  if (dump_enabled_p ())
> > +    {
> > +      dump_printf_loc (MSG_NOTE, vect_location, "Optimized SLP instances:
> \n");
> > +      hash_set<slp_tree> visited;
> > +      slp_instance instance;
> > +      FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
> > +	vect_print_slp_graph (MSG_NOTE, vect_location,
> > +			      SLP_INSTANCE_TREE (instance), visited);
> > +    }
> >  }
> >
> >  /* Gather loads reachable from the individual SLP graph entries.  */
> >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
Richard Biener June 22, 2021, 2:13 p.m. UTC | #4
On Tue, 22 Jun 2021, Tamar Christina wrote:

> 
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Tuesday, June 22, 2021 1:08 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > Subject: Re: [PATCH]middle-end[RFC] slp: new implementation of complex
> > numbers
> > 
> > On Mon, 21 Jun 2021, Tamar Christina wrote:
> > 
> > > Hi Richi,
> > >
> > > This patch is still very much incomplete and I do know that it is
> > > missing things but it's complete enough such that examples are working
> > > and allows me to show what I'm working towards.
> > >
> > > note, that this approach will remove a lot of code in
> > > tree-vect-slp-patterns but to keep the diff readable I've left them in
> > > and just commented out the calls or removed them where needed.
> > >
> > > The patch rewrites the complex numbers detection by splitting the
> > > detection of structure from dataflow analysis.  In principle the
> > > biggest difference between this and the previous implementation is
> > > that instead of trying to detect valid complex operations it *makes*
> > > an operation a valid complex operation.
> > >
> > > To do this each operation gets a dual optab which matches the same
> > > structure but has no dataflow requirement.
> > >
> > > i.e. in this patch I added 4, ADDSUB, SUBADD, MUL_ADDSUB,
> > MULL_SUBADD.
> > >
> > > There is a then a mapping between these and their variant with the
> > dataflow:
> > >
> > > * ADDSUB -> COMPLEX_ADD_ROT270
> > > * SUBADD -> COMPLEX_ADD_ROT90
> > > * MUL_ADDSUB -> COMPLEX_MUL_CONJ
> > > * MUL_SUBADD -> COMPLEX_MUL
> > >
> > > with the intention that when we detect the structure of an operation
> > > we query the backend for both optabs.
> > >
> > > This should result in one of three states:
> > >
> > >  * not supported: Move on.
> > >  * Supports ADDSUB only: Rewrite using ADDSUB, set type to
> > 'cannot_transform'
> > >  * Supports COMPLEX_ADD_ROT270 only: Rewrite using ADDSUB, set type
> > to 'must_transform'
> > >  * Supports both: Rewrite using ADDSUB, set type fo 'can_transform'
> > >
> > > with the idea behind `can_transform` is to check the costs of the
> > > inverse permute needed to use the complex operation and if this is
> > > very expensive then stick to addsub.  This requires the target to be
> > > able to cost the operations reasonably correct.
> > >
> > > So for ADD this looks like
> > >
> > >  === vect_match_slp_patterns ===
> > >  Analyzing SLP tree 0x494e970 for patterns  Found ADDSUB pattern in
> > > SLP tree  Target does not support ADDSUB for vector type vector(4)
> > > float  Found COMPLEX_ADD_ROT270 pattern in SLP tree  Target supports
> > > COMPLEX_ADD_ROT270 vectorization with mode vector(4) float Pattern
> > > matched SLP tree node 0x494e970 (max_nunits=4, refcnt=1) op template:
> > > REALPART_EXPR <*_10> = _23;
> > >   stmt 0 REALPART_EXPR <*_10> = _23;
> > >   stmt 1 IMAGPART_EXPR <*_10> = _22;
> > >   children 0x494ea00
> > > node 0x494ea00 (max_nunits=4, refcnt=1) op template: slp_patt_39 =
> > > .ADDSUB (_23, _23);
> > >   stmt 0 _23 = _6 + _13;
> > >   stmt 1 _22 = _12 - _8;
> > >   children 0x494eb20 0x494ebb0
> > > node 0x494eb20 (max_nunits=4, refcnt=1) op template: _13 =
> > > REALPART_EXPR <*_3>;
> > >   stmt 0 _13 = REALPART_EXPR <*_3>;
> > >   stmt 1 _12 = IMAGPART_EXPR <*_3>;
> > > node 0x494ebb0 (max_nunits=4, refcnt=1)
> > > op: VEC_PERM_EXPR
> > >   { }
> > >   lane permutation { 0[1] 0[0] }
> > >   children 0x494ec40
> > > node 0x494ec40 (max_nunits=1, refcnt=2) op template: _8 =
> > > REALPART_EXPR <*_5>;
> > >   stmt 0 _8 = REALPART_EXPR <*_5>;
> > >   stmt 1 _6 = IMAGPART_EXPR <*_5>;
> > >   load permutation { 0 1 }
> > >
> > > and later during optimize_slp we get
> > >
> > > Tranforming SLP expression from ADDSUB to COMPLEX_ADD_ROT270
> > > processing node 0x494ebb0 simplifying permute node 0x494ebb0
> > Optimized
> > > SLP instances:
> > > node 0x494e970 (max_nunits=4, refcnt=1) op template: REALPART_EXPR
> > > <*_10> = _23;
> > >    stmt 0 REALPART_EXPR <*_10> = _23;
> > >    stmt 1 IMAGPART_EXPR <*_10> = _22;
> > >    children 0x494ea00
> > > node 0x494ea00 (max_nunits=4, refcnt=1) op template: slp_patt_39 =
> > > .COMPLEX_ADD_ROT270 (_23, _23);
> > >    stmt 0 _23 = _6 + _13;
> > >    stmt 1 _22 = _12 - _8;
> > >    children 0x494eb20 0x494ebb0
> > > node 0x494eb20 (max_nunits=4, refcnt=1) op template: _13 =
> > > REALPART_EXPR <*_3>;
> > >    stmt 0 _13 = REALPART_EXPR <*_3>;
> > >    stmt 1 _12 = IMAGPART_EXPR <*_3>;
> > > node 0x494ebb0 (max_nunits=4, refcnt=1)
> > > op: VEC_PERM_EXPR
> > >    { }
> > >    lane permutation { 0[0] 0[1] }
> > >    children 0x494ec40
> > > node 0x494ec40 (max_nunits=1, refcnt=2) op template: _8 =
> > > REALPART_EXPR <*_5>;
> > >    stmt 0 _8 = REALPART_EXPR <*_5>;
> > >    stmt 1 _6 = IMAGPART_EXPR <*_5>;
> > >
> > > Now I still have to elide the VEC_PERM_EXPR here but that's easy.
> > 
> > So having skimmed half of the patch - this means SLP pattern recog will
> > initially recognize { +, -, +, - } as ADDSUB for example but not factor in lane
> > permutes from loads yet.  Now, suppose we have { +, -, -, + } seen in pattern
> > recog - how's that handled?
> 
> These will be rejected, the lanes are still checked to ensure that it's a { +, - ,+, -}.
> The lane permutes are checked in unchanged code, namely
> 
> vect_detect_pair_op (slp_tree node, bool two_operands = true,...)
> 
> which only calls ::matches when the operation is consistent.
> 
> > It might feed operations where we'd like to have inputs permuted and thus
> > would end up with ADDSUB in the end?
> 
> In principle we could allow this, if we were to do this then one way to handle
> it would be to add a permute for ADDSUB as well to permute the lanes.
> 
> I think it's better in this scenario to have the pattern recog code insert the
> permute early on then, in the ::build phase. This will then maintain correctness
> in the SLP tree and the extra permute will be dealt with in optimize_slp's normal
> working.
> 
> It becomes a bit tricky here though, since such cases are perhaps better handled by
> Increasing the VF and using permuted loads (LOAD_LANES).
> 
> > 
> > That said, you do
> > 
> > +         /* Check to see if this node can be transformed during permute
> > +            materialization.  */
> > +         bool patt_trans_cand_p = is_pattern_convert_candidate_p (node);
> > +         if (patt_trans_cand_p)
> > +           bitmap_set_bit (n_transform, idx);
> > 
> > in the propagation stage (but this looks like a static marking).
> 
> Yeah indeed, it's just a way to say "inspect these later".
> 
> > 
> > And then just for each of those candidates you somehow "undo" the
> > permutes on the SLP childs (by modifying it in-place - uh, see below).
> > But if you figure one child that is not permuted you've done that anyway but
> > stop.
> 
> Yeah, if the child is unpermuted, vect_reverse_perm returns the identity permute
> which is then later elided.  In the final version this should just not set permute at all.
> 
> > 
> > So what I've expected is that this "transform" is done during propagation
> > (where you now set the n_transform bit), by means of detecting the
> > candidate as materialization point (but I'd have to recap the kind of permutes
> > we expect to be incoming).  That is, for a specific set of permutes on the
> > successor edges claim the node can handle the permute and thus propagate
> > unpermuted upwards.
> > 
> > At the materialization stage you'd then transform the node.
> 
> Yeah, that makes sense. I separated them out to have distinct "stages", but yeah
> the loops can be combined. I'll do that.
> 
> > 
> > Btw, how do we handle targets that can do complex_addsub but not addsub?
> > 
> 
> That's why during pattern matching the optab code checks both. ADDSUB is temporarily
> accepted and then the transform code MUST transform it. So ADDSUB cannot leave
> optimize_slp is the operation is not supported.
> 
> This is what I meant with the three states (missing from this patch)
> 
>  * not supported: Move on.
>  * Supports ADDSUB only: Rewrite using ADDSUB, set type to 'cannot_transform'
>  * Supports COMPLEX_ADD_ROT270 only: Rewrite using ADDSUB, set type to 'must_transform'
>  * Supports both: Rewrite using ADDSUB, set type fo 'can_transform'

So I wonder why we don't do both during optimize_slp then?  The
candidates would be the two-operator VEC_PERM nodes and we'd have
"known" (but possibly not yet stable) permutes on the edges
of the graph.

Note yesterday I was playing around to see how to match x86 pmaddwd, which does

int res[4];
short src1[8], src2[8];

res[0] = ((int)src1[0]*(int)src2[0] + (int)src1[1]*(int)src2[1]);
res[1] = ((int)src1[2]*(int)src2[2] + (int)src1[3]*(int)src2[3]);
...

thus it performs a widening HI->SImode multiplication on two vectors,
producing an SImode vector of half the number of elements, combining
adjacent lanes by accumulating them.  That's an SLP pattern because
it involves two source lanes per operation (the accumulate).  It is
somewhat special since it reduces the number of lanes on an SLP
branch - something we likely do not handle very well.  If you look at
the SLP graph at this point you'll see the add; on one
operand there's a permute (select) { 0, 2, 4, 6 } and on the
other there's { 1, 3, 5, 7 }, and since x86 supports widening mults
the scalar pattern detection handled that part.
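
Spelled out as a loop (my reading of the elided lanes in the fragment above),
the computation is:

/* Each int result accumulates one adjacent pair of widening short->int
   multiplies, which is what pmaddwd does per output lane.  */
void
dot_pairs (int res[4], const short src1[8], const short src2[8])
{
  for (int i = 0; i < 4; i++)
    res[i] = (int) src1[2 * i] * (int) src2[2 * i]
             + (int) src1[2 * i + 1] * (int) src2[2 * i + 1];
}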

Currently the lane permute propagation will not handle this
(I think), so we would need to amend it a bit.  I guess we also
need to add some debugging verboseness to it ;)

> > > This works for ADD, but it doesn't work well when there's a
> > > complicated sequence of loads.  So for example
> > >
> > > #define N 200
> > > void g (double complex a[restrict N], double complex b[restrict N],
> > > 	double complex c[restrict N])
> > > {
> > >   for (int i=0; i < N; i++)
> > >     {
> > >       c[i] =  a[i] * b[i];
> > >     }
> > > }
> > >
> > > will results in an SLP tree where each operand of the multiply does
> > > not get to see all the original vectors:
> > >
> > > Final SLP tree for instance 0x5678a30:
> > > node 0x55965a0 (max_nunits=4, refcnt=2) op template: REALPART_EXPR
> > > <*_7> = _25;
> > >   stmt 0 REALPART_EXPR <*_7> = _25;
> > >   stmt 1 IMAGPART_EXPR <*_7> = _26;
> > >   children 0x5596630
> > > node 0x5596630 (max_nunits=4, refcnt=2)
> > > op: VEC_PERM_EXPR
> > >   stmt 0 _25 = _17 - _22;
> > >   stmt 1 _26 = _23 + _24;
> > >   lane permutation { 0[0] 1[1] }
> > >   children 0x5596a20 0x5596ab0
> > > node 0x5596a20 (max_nunits=1, refcnt=1) op template: _25 = _17 - _22;
> > >   { }
> > >   children 0x55966c0 0x5596870
> > > node 0x55966c0 (max_nunits=4, refcnt=3) op template: _17 = _10 * _19;
> > >   stmt 0 _17 = _10 * _19;
> > >   stmt 1 _23 = _10 * _18;
> > >   children 0x5596750 0x55967e0
> > > node 0x5596750 (max_nunits=4, refcnt=2) op template: _10 =
> > > REALPART_EXPR <*_3>;
> > >   stmt 0 _10 = REALPART_EXPR <*_3>;
> > >   stmt 1 _10 = REALPART_EXPR <*_3>;
> > >   load permutation { 0 0 }
> > > node 0x55967e0 (max_nunits=4, refcnt=2) op template: _19 =
> > > REALPART_EXPR <*_5>;
> > >   stmt 0 _19 = REALPART_EXPR <*_5>;
> > >   stmt 1 _18 = IMAGPART_EXPR <*_5>;
> > >   load permutation { 0 1 }
> > > node 0x5596870 (max_nunits=4, refcnt=3) op template: _22 = _9 * _18;
> > >   stmt 0 _22 = _9 * _18;
> > >   stmt 1 _24 = _9 * _19;
> > >   children 0x5596900 0x5596990
> > > node 0x5596900 (max_nunits=4, refcnt=2) op template: _9 =
> > > IMAGPART_EXPR <*_3>;
> > >   stmt 0 _9 = IMAGPART_EXPR <*_3>;
> > >   stmt 1 _9 = IMAGPART_EXPR <*_3>;
> > >   load permutation { 1 1 }
> > > node 0x5596990 (max_nunits=4, refcnt=2) op template: _18 =
> > > IMAGPART_EXPR <*_5>;
> > >   stmt 0 _18 = IMAGPART_EXPR <*_5>;
> > >   stmt 1 _19 = REALPART_EXPR <*_5>;
> > >   load permutation { 1 0 }
> > > node 0x5596ab0 (max_nunits=1, refcnt=1) op template: _26 = _23 + _24;
> > >   { }
> > >   children 0x55966c0 0x5596870
> > >
> > > So depending on the node each multiply either sees REAL 3 + REAL 5 +
> > > IMAG 5 or IMAG 3 + IMAG 5 + REAL 5.
> > >
> > > since we are removing the TWO_OPERANDS node we need to drop one of
> > the
> > > multiply and so we need to give it the original 2 vectors as a
> > > parameter.  The current implementation emits a permute operation to
> > > combine the loads again and later pushes the permute down.  This is
> > > problematic as it again requires us to do df analysis early.
> > >
> > > To counter this, in the patch I have changed loads to no longer come
> > > out of build_slp with LOAD_PERMUTES but instead to have a VEC_PERM
> > above each load.
> > 
> > Yep.  I've been there before (not sure if I ever sent you my work-in-progress
> > here).  There's some wrongs in your patch but I guess doing this exactly for
> > the permutes optimize_slp handles would be fine.
> 
> Yeah, it fell apart when testing x264 code above so I noticed I had some of the
> lanes wrong, but didn't fix it yet as wasn't needed for my testcases.
> 
> > 
> > We should see to do this independently of the stuff above, I can handle this
> > and will prepare a patch for this later.
> > 
> 
> Thanks! That's a big help!
> 
> > > This is only temporary and the idea is that optimize SLP will push
> > > them back down into the loads once it's deemed there are still permutes
> > needed.
> > >
> > > So COMPLEX MUL becomes:
> > >
> > > Final SLP tree for instance 0x46d9c80:
> > > node 0x45f76c0 (max_nunits=4, refcnt=2) op template: REALPART_EXPR
> > > <*_7> = _25;
> > >   stmt 0 REALPART_EXPR <*_7> = _25;
> > >   stmt 1 IMAGPART_EXPR <*_7> = _26;
> > >   children 0x45f7750
> > > node 0x45f7750 (max_nunits=4, refcnt=2)
> > > op: VEC_PERM_EXPR
> > >   stmt 0 _25 = _17 - _22;
> > >   stmt 1 _26 = _23 + _24;
> > >   lane permutation { 0[0] 1[1] }
> > >   children 0x45f7bd0 0x45f7c60
> > > node 0x45f7bd0 (max_nunits=1, refcnt=1) op template: _25 = _17 - _22;
> > >   { }
> > >   children 0x45f77e0 0x45f7a20
> > > node 0x45f77e0 (max_nunits=4, refcnt=3) op template: _17 = _10 * _19;
> > >   stmt 0 _17 = _10 * _19;
> > >   stmt 1 _23 = _10 * _18;
> > >   children 0x45f7870 0x45f7990
> > > node 0x45f7870 (max_nunits=4, refcnt=2)
> > > op: VEC_PERM_EXPR
> > >   { }
> > >   lane permutation { 0[0] 0[0] }
> > >   children 0x45f7900
> > > node 0x45f7900 (max_nunits=1, refcnt=4) op template: _10 =
> > > REALPART_EXPR <*_3>;
> > >   stmt 0 _10 = REALPART_EXPR <*_3>;
> > >   stmt 1 _9 = IMAGPART_EXPR <*_3>;
> > >   load permutation { 0 1 }
> > > node 0x45f7990 (max_nunits=4, refcnt=3) op template: _19 =
> > > REALPART_EXPR <*_5>;
> > >   stmt 0 _19 = REALPART_EXPR <*_5>;
> > >   stmt 1 _18 = IMAGPART_EXPR <*_5>;
> > >   load permutation { 0 1 }
> > > node 0x45f7a20 (max_nunits=4, refcnt=3) op template: _22 = _9 * _18;
> > >   stmt 0 _22 = _9 * _18;
> > >   stmt 1 _24 = _9 * _19;
> > >   children 0x45f7ab0 0x45f7b40
> > > node 0x45f7ab0 (max_nunits=4, refcnt=2)
> > > op: VEC_PERM_EXPR
> > >   { }
> > >   lane permutation { 0[1] 0[1] }
> > >   children 0x45f7900
> > > node 0x45f7b40 (max_nunits=4, refcnt=2)
> > > op: VEC_PERM_EXPR
> > >   { }
> > >   lane permutation { 0[1] 0[0] }
> > >   children 0x45f7990
> > > node 0x45f7c60 (max_nunits=1, refcnt=1) op template: _26 = _23 + _24;
> > >   { }
> > >   children 0x45f77e0 0x45f7a20
> > >
> > > This representation has a number of advantages:
> > >
> > >  * complex mul becomes trivial, as trivial as add.
> > >  * add now detects itself as part of a sequence of complex mul.  i.e. if you
> > >    have only add, it will use it instead of failing to follow the df like now.
> > >  * It opens the door a bit more to vectorize the first loop in x264.  The code
> > >    there has a cross iteration mix of loading the full vector but in an order
> > >    that the vectorizer thinks it can't use a single linear load for.  e.g.
> > >
> > >    .. = (a[0] + b[0]) - (a[4] + b[4])
> > >    .. = (a[1] + b[1]) - (a[5] + b[5])
> > >    .. = (a[2] + b[2]) - (a[6] + b[6])
> > >    .. = (a[3] + b[3]) - (a[7] + b[6])
> > >
> > >  Ultimately it thinks it's cheaper to use scalar loads here, while (at
> > > least for
> > >  aarch64) if we see the unrolled loop together we can pattern match
> > > this sequence  and rewrite it to something that uses the linear load.
> > > This is because the  addition and subtract are widening.  So the
> > > widening arithmetic operations we  have already only read half the vector
> > since they're widening.
> > >
> > >  For this to work, we need to see all 16 bytes loads, so the full
> > > unrolled loop  of 4 iterations. Which currently doesn't happen for
> > > NEON but does for SVE. (though  I wonder if now that you've added SLP
> > > decomposition if the initial SLP build  shouldn't allow building of
> > > SLP trees that are wider than the vector length and  later just
> > > decompose it or bump up the VF.).  However this is something for  another
> > discussion :).
> > >
> > >  Changing the loads like this makes things a lot simpler and we
> > > require no  additional work anywhere else.  The complexity ends up in
> > > optimize SLP which  now has the following changes:
> > >
> > >  * loads no longer introduce permutes, so they don't see the permutations.
> > >    Instead each load can lead to a permute which can see the permutes.
> > >  * As we're moving permutes up we stop at call boundaries as well.
> > >  * As we're calculating materialization points for the permute we inspect
> > the
> > >    node and check if we can transform it, if so we mark it for transformation.
> > >  * After calculating the materialization and transform points we iterate over
> > >    the transform points checking whether we should transform.  If we
> > should we
> > >    modify the permute at the materialization point (which should be at the
> > >    transform point) into the inverse permute.
> > >  * During materialization as the permute is transformed this would undo
> > the
> > >    permutation if we transformed the instruction.
> > >  * After computing this we elide the permute node, or push it down
> > towards the
> > >    load and then back into the load itself, splitting the node if required.
> > >
> > > After this the tree should be back to it's normal form.  I still need
> > > to work out when and where one needs to push down the permute all the
> > > way to the loads.
> > 
> > It's indeed the case that the current permute "propagation" is away from
> > loads only and a phase to propagate in the reverse direction is missing if we
> > now have permute sources other than loads.  The cycle handling of the
> > propagation is also a bit awkward and incomplete (but it "works").
> > 
> 
> The thing I'm also wondering about is if it's always better to push it back in,
> e.g. if from a given load you have different permuted variants of the result
> if it can't be beneficial to sometimes keep the permutes and do the load only
> once.
> 
> Thanks,
> Tamar
> 
> > Richard.
> > 
> > > But this covers my thoughts.  Any feedback is appreciated to get it
> > > right early on :)
> > >
> > > Regards,
> > > Tamar
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> > Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
>
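To make the shape concrete, here is the kind of scalar loop the rewritten
detection is aimed at, assuming the usual interleaved layout (even lane =
real part, odd lane = imaginary part).  The function name is hypothetical
and this is my own illustrative example, not a testcase from the patch:

void
cadd90_shape (const float *a, const float *b, float *c, int n)
{
  for (int i = 0; i < n; i += 2)
    {
      c[i]     = a[i]     - b[i + 1];   /* re = re_a - im_b  */
      c[i + 1] = a[i + 1] + b[i];       /* im = im_a + re_b  */
    }
}

Structurally this is the MINUS_PLUS case that complex_add_pattern::matches
below now classifies as IFN_SUBADD: subtract in the even lanes, add in the
odd lanes, with the second operand's even/odd lanes swapped.  If the target
only provides subadd, the swap stays as a VEC_PERM_EXPR; if it provides
cadd90, the swap can be folded into the instruction, since cadd90 computes
exactly the lanes above from the un-swapped operand.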

Patch

diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index b2f414d2131b867eda337cd30f5ed40ed7c9fa10..73d1d742414d16e06e8312967990d565eddb218d 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -277,6 +277,10 @@  DEF_INTERNAL_FLT_FN (SCALB, ECF_CONST, scalb, binary)
 DEF_INTERNAL_FLT_FLOATN_FN (FMIN, ECF_CONST, fmin, binary)
 DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
 DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
+DEF_INTERNAL_OPTAB_FN (ADDSUB, ECF_CONST, addsub, binary)
+DEF_INTERNAL_OPTAB_FN (SUBADD, ECF_CONST, subadd, binary)
+DEF_INTERNAL_OPTAB_FN (MUL_ADDSUB, ECF_CONST, mul_addsub, binary)
+DEF_INTERNAL_OPTAB_FN (MUL_SUBADD, ECF_CONST, mul_subadd, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index b192a9d070b8aa72e5676b2eaa020b5bdd7ffcc8..73d23c49bdfa7423f67a6240650c558cfaf3f2f0 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -290,6 +290,10 @@  OPTAB_D (atan_optab, "atan$a2")
 OPTAB_D (atanh_optab, "atanh$a2")
 OPTAB_D (copysign_optab, "copysign$F$a3")
 OPTAB_D (xorsign_optab, "xorsign$F$a3")
+OPTAB_D (addsub_optab, "addsub$a3")
+OPTAB_D (subadd_optab, "subadd$a3")
+OPTAB_D (mul_addsub_optab, "mul_addsub$a3")
+OPTAB_D (mul_subadd_optab, "mul_subadd$a3")
 OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")
 OPTAB_D (cmul_optab, "cmul$a3")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 2ed49cd9edcabd7948b365dd60d7405b79079a7b..ce8d2daa1f06f17eda2cb08255d76bac9cf263f0 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -68,6 +68,39 @@  along with GCC; see the file COPYING3.  If not see
  * vect_pattern class
  ******************************************************************************/
 
+static bool
+vect_pattern_validate_optab (internal_fn ifn, slp_tree node);
+
+
+/* Check if the base or complex equivalents are supported.  */
+static bool
+vect_pattern_validate_optab_or_eq (internal_fn ifn, slp_tree node)
+{
+  if (vect_pattern_validate_optab (ifn, node))
+    return true;
+
+  switch (ifn)
+  {
+    case IFN_ADDSUB:
+      ifn = IFN_COMPLEX_ADD_ROT270;
+      break;
+    case IFN_SUBADD:
+      ifn = IFN_COMPLEX_ADD_ROT90;
+      break;
+    case IFN_MUL_ADDSUB:
+      ifn = IFN_COMPLEX_MUL_CONJ;
+      break;
+    case IFN_MUL_SUBADD:
+      ifn = IFN_COMPLEX_MUL;
+      break;
+    default:
+      gcc_unreachable ();
+  }
+
+  return vect_pattern_validate_optab (ifn, node);
+}
+
+
 /* Default implementation of recognize that performs matching, validation and
    replacement of nodes but that can be overriden if required.  */
 
@@ -630,8 +663,7 @@  complex_add_pattern::build (vec_info *vinfo)
 
   /* First re-arrange the children.  */
   SLP_TREE_CHILDREN (*this->m_node)[0] = children[0];
-  SLP_TREE_CHILDREN (*this->m_node)[1] =
-    vect_build_swap_evenodd_node (children[1]);
+  SLP_TREE_CHILDREN (*this->m_node)[1] = children[1];
 
   SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (*this->m_node)[0])++;
   SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (*this->m_node)[1])++;
@@ -670,9 +702,9 @@  complex_add_pattern::matches (complex_operation_t op,
       Rotation 0 and 180 can be handled by normal SIMD code, so we don't need
       to care about them here.  */
   if (op == MINUS_PLUS)
-    ifn = IFN_COMPLEX_ADD_ROT90;
+    ifn = IFN_SUBADD;
   else if (op == PLUS_MINUS)
-    ifn = IFN_COMPLEX_ADD_ROT270;
+    ifn = IFN_ADDSUB;
   else
     return ifn;
 
@@ -680,17 +712,7 @@  complex_add_pattern::matches (complex_operation_t op,
      we support.  */
   gcc_assert (ops->length () == 2);
 
-  vec<slp_tree> children = SLP_TREE_CHILDREN ((*ops)[0]);
-
-  /* First node must be unpermuted.  */
-  if (linear_loads_p (perm_cache, children[0]) != PERM_EVENODD)
-    return IFN_LAST;
-
-  /* Second node must be permuted.  */
-  if (linear_loads_p (perm_cache, children[1]) != PERM_ODDEVEN)
-    return IFN_LAST;
-
-  if (!vect_pattern_validate_optab (ifn, *node))
+  if (!vect_pattern_validate_optab_or_eq (ifn, *node))
     return IFN_LAST;
 
   return ifn;
@@ -1501,7 +1523,8 @@  vect_pattern_decl_t slp_patterns[]
      order patterns from the largest to the smallest.  Especially if they
      overlap in what they can detect.  */
 
-  SLP_PATTERN (complex_operations_pattern),
+  SLP_PATTERN (complex_add_pattern),
+  //SLP_PATTERN (complex_operations_pattern),
 };
 #undef SLP_PATTERN
 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 0c1f85beeb2e9f3fb7c66c15d4d30594b2570f9e..f24ad85d15780807c49153895b7340c9b4d4d1f3 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1764,25 +1764,69 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	{
 	  *max_nunits = this_max_nunits;
 	  (*tree_size)++;
-	  node = vect_create_new_slp_node (node, stmts, 0);
-	  SLP_TREE_VECTYPE (node) = vectype;
+	  node = vect_create_new_slp_node (node, stmts, 1);
 	  /* And compute the load permutation.  Whether it is actually
 	     a permutation depends on the unrolling factor which is
-	     decided later.  */
-	  vec<unsigned> load_permutation;
+	     decided later.  We specifically materialize permutes at this
+	     early stage and leave it up to optimize SLP to push them back
+	     into the load if needed.  */
+	  lane_permutation_t lane_permutation;
+
+	  /* See the loads with a linear permute, this so that optimize_slp
+	     can later push the permute upwards and downwards without needing
+	     to inspect the parent node.  */
+	  load_permutation_t load_permutation;
 	  int j;
-	  stmt_vec_info load_info;
+	  stmt_vec_info load_info, curr_load;
+	  lane_permutation.create (group_size);
 	  load_permutation.create (group_size);
+	  vec<stmt_vec_info> group_stmts;
+	  bool linear = true;
 	  stmt_vec_info first_stmt_info
 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+	  group_stmts.create (DR_GROUP_SIZE (first_stmt_info));
+	  curr_load = first_stmt_info;
 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
 	    {
 	      int load_place = vect_get_place_in_interleaving_chain
 		  (load_info, first_stmt_info);
 	      gcc_assert (load_place != -1);
-	      load_permutation.safe_push (load_place);
+
+	      lane_permutation.safe_push (std::make_pair (0, load_place));
+	      load_permutation.quick_push (j);
+
+	      group_stmts.quick_push (curr_load);
+	      linear = linear && curr_load == load_info;
+	      curr_load = DR_GROUP_NEXT_ELEMENT (curr_load);
+	    }
+
+	  if (!linear)
+	    {
+	      slp_tree child;
+	      if (slp_tree *load = bst_map->get (group_stmts))
+		child = *load;
+	      else
+		{
+		  child =  vect_create_new_slp_node (group_stmts.copy (), 0);
+		  SLP_TREE_VECTYPE (child) = vectype;
+		  /* One for me, one for the BST map.  */
+		  SLP_TREE_REF_COUNT (child)++;
+		  bst_map->put (group_stmts, child);
+		}
+
+	      /* Set some metadata on the load node.  */
+	      SLP_TREE_REF_COUNT (child)++;
+	      SLP_TREE_LANES (child) = SLP_TREE_LANES (node);
+	      SLP_TREE_LOAD_PERMUTATION (child) = load_permutation;
+
+	      /* But return a permute node.  */
+	      SLP_TREE_LANE_PERMUTATION (node) = lane_permutation;
+	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+	      SLP_TREE_CHILDREN (node).quick_push (child);
+	      SLP_TREE_SCALAR_STMTS (node) = vNULL;
 	    }
-	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
+
+	  SLP_TREE_VECTYPE (node) = vectype;
 	  return node;
 	}
     }
@@ -3430,8 +3474,8 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
 	{
 	  slp_tree root = SLP_INSTANCE_TREE (instance);
-	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
-					&load_map, root);
+	  //optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
+	//				&load_map, root);
 	}
     }
 
@@ -3548,6 +3592,83 @@  vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
 			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
 }
 
+static bool
+is_pattern_convert_candidate_p (slp_tree node)
+{
+  stmt_vec_info stmt_info;
+  if (!(stmt_info = SLP_TREE_REPRESENTATIVE (node))
+      || !STMT_VINFO_SLP_VECT_ONLY_PATTERN (stmt_info))
+    return false;
+
+  gimple* stmt = STMT_VINFO_STMT (stmt_info);
+  if (!is_gimple_call (stmt))
+    return false;
+
+  if (!gimple_call_internal_p (stmt))
+    return false;
+
+  internal_fn ifn = gimple_call_internal_fn (stmt);
+
+  switch (ifn)
+    {
+    case IFN_ADDSUB:
+    case IFN_SUBADD:
+    case IFN_MUL_ADDSUB:
+    case IFN_MUL_SUBADD:
+      return true;
+    default:
+      break;
+    }
+
+  return false;
+}
+
+static internal_fn
+vect_get_transformed_pattern (internal_fn ifn, slp_tree /* node */)
+{
+  switch (ifn)
+    {
+    case IFN_ADDSUB:
+      ifn = IFN_COMPLEX_ADD_ROT270;
+      break;
+    case IFN_SUBADD:
+      ifn = IFN_COMPLEX_ADD_ROT90;
+      break;
+    case IFN_MUL_ADDSUB:
+      ifn = IFN_COMPLEX_MUL_CONJ;
+      break;
+    case IFN_MUL_SUBADD:
+      ifn = IFN_COMPLEX_MUL;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  return ifn;
+}
+
+
+static load_permutation_t
+vect_reverse_perm (load_permutation_t perm)
+{
+  auto_vec<std::pair<unsigned, unsigned> > pairs;
+  pairs.create (perm.length ());
+  unsigned i;
+  for (i = 0; i < perm.length (); i++)
+    pairs.quick_push (std::make_pair (i, perm[i]));
+
+  typedef const std::pair<unsigned, unsigned>* cmp_t;
+  pairs.qsort ([](const void* a, const void* b) -> int
+    { return (int)((cmp_t)a)->second - (int)((cmp_t)b)->second; });
+
+  load_permutation_t rev_perm;
+  rev_perm.create (perm.length ());
+  for (i = 0; i < perm.length (); i++)
+    rev_perm.quick_push (pairs[i].first);
+
+  return rev_perm;
+}
+
 /* Optimize the SLP graph of VINFO.  */
 
 void
@@ -3578,11 +3699,13 @@  vect_optimize_slp (vec_info *vinfo)
 
   auto_sbitmap n_visited (vertices.length ());
   auto_sbitmap n_materialize (vertices.length ());
+  auto_sbitmap n_transform (vertices.length ());
   auto_vec<int> n_perm (vertices.length ());
   auto_vec<vec<unsigned> > perms;
 
   bitmap_clear (n_visited);
   bitmap_clear (n_materialize);
+  bitmap_clear (n_transform);
   n_perm.quick_grow_cleared (vertices.length ());
   perms.safe_push (vNULL); /* zero is no permute */
 
@@ -3602,55 +3725,73 @@  vect_optimize_slp (vec_info *vinfo)
 	 as entries to the reverse graph.  */
       if (!slpg->vertices[idx].succ)
 	bitmap_set_bit (n_visited, idx);
-      /* Loads are the only thing generating permutes.  */
-      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
-	continue;
 
-      /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
-	 node unpermuted, record this permute.  */
-      stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
-      if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
-	continue;
-      dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
-      unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
-      bool any_permute = false;
-      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+      int perm_seed = 0;
+
+      for (graph_edge *pred = slpg->vertices[idx].pred;
+	   pred; pred = pred->pred_next)
 	{
-	  unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
-	  imin = MIN (imin, idx);
-	  imax = MAX (imax, idx);
-	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
-	    any_permute = true;
-	}
-      /* If there's no permute no need to split one out.  */
-      if (!any_permute)
-	continue;
-      /* If the span doesn't match we'd disrupt VF computation, avoid
-	 that for now.  */
-      if (imax - imin + 1 != SLP_TREE_LANES (node))
-	continue;
+	  int src_idx = pred->src;
+	  slp_tree src_node = vertices[src_idx];
 
-      /* For now only handle true permutes, like
-	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
-	 when permuting constants and invariants keeping the permute
-	 bijective.  */
-      auto_sbitmap load_index (SLP_TREE_LANES (node));
-      bitmap_clear (load_index);
-      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
-	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
-      unsigned j;
-      for (j = 0; j < SLP_TREE_LANES (node); ++j)
-	if (!bitmap_bit_p (load_index, j))
-	  break;
-      if (j != SLP_TREE_LANES (node))
-	continue;
+	  /* Loads are the only thing generating permutes, however the permutes
+	     are not yet lowered into the loads.  So instead chase up the chain
+	     to find a permute node.  */
+	  if (!SLP_TREE_LANE_PERMUTATION (src_node).exists ())
+	    continue;
+
+	  /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
+	     node unpermuted, record this permute.  */
+	  stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
+	  if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
+	    continue;
+	  dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
+	  unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
+	  bool any_permute = false;
+	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
+	    {
+	      unsigned src_op = SLP_TREE_LANE_PERMUTATION (src_node)[j].first;
+	      unsigned idx = SLP_TREE_LANE_PERMUTATION (src_node)[j].second;
+	      /* This isn't a load permute moved out.  */
+	      if (src_op != 0)
+		continue;
+	      imin = MIN (imin, idx);
+	      imax = MAX (imax, idx);
+	      if (idx - SLP_TREE_LANE_PERMUTATION (src_node)[0].second != j)
+		any_permute = true;
+	    }
+	  /* If there's no permute no need to split one out.  */
+	  if (!any_permute)
+	    continue;
+	  /* If the span doesn't match we'd disrupt VF computation, avoid
+	     that for now.  */
+	  if (imax - imin + 1 != SLP_TREE_LANES (node))
+	    continue;
+
+	  /* For now only handle true permutes, like
+	     vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
+	     when permuting constants and invariants keeping the permute
+	     bijective.  */
+	  auto_sbitmap load_index (SLP_TREE_LANES (src_node));
+	  bitmap_clear (load_index);
+	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
+	    bitmap_set_bit (load_index, SLP_TREE_LANE_PERMUTATION (src_node)[j].second - imin);
+	  unsigned j;
+	  for (j = 0; j < SLP_TREE_LANES (src_node); ++j)
+	  if (!bitmap_bit_p (load_index, j))
+	    break;
+	  if (j != SLP_TREE_LANES (src_node))
+	    continue;
 
-      vec<unsigned> perm = vNULL;
-      perm.safe_grow (SLP_TREE_LANES (node), true);
-      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
-	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
-      perms.safe_push (perm);
-      n_perm[idx] = perms.length () - 1;
+	  vec<unsigned> perm = vNULL;
+	  perm.safe_grow (SLP_TREE_LANES (src_node), true);
+	  for (unsigned j = 0; j < SLP_TREE_LANES (src_node); ++j)
+	    perm[j] = SLP_TREE_LANE_PERMUTATION (src_node)[j].second - imin;
+	  perms.safe_push (perm);
+	  n_perm[src_idx] = perms.length () - 1;
+	  perm_seed = -1;
+       }
+      n_perm[idx] = perm_seed;
     }
 
   /* Propagate permutes along the graph and compute materialization points.  */
@@ -3667,12 +3808,12 @@  vect_optimize_slp (vec_info *vinfo)
 	  slp_tree node = vertices[idx];
 	  /* For leafs there's nothing to do - we've seeded permutes
 	     on those above.  */
-	  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
+	  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def || !slpg->vertices[idx].succ)
 	    continue;
 
 	  bitmap_set_bit (n_visited, idx);
 
-	  /* We cannot move a permute across a store.  */
+	  /* We cannot move a permute across a store.  TODO: or a call.  */
 	  if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
 	      && DR_IS_WRITE
 		   (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
@@ -3723,6 +3864,12 @@  vect_optimize_slp (vec_info *vinfo)
 	      changed = true;
 	    }
 
+	  /* Check to see if this node can be transformed during permute
+	     materialization.  */
+	  bool patt_trans_cand_p = is_pattern_convert_candidate_p (node);
+	  if (patt_trans_cand_p)
+	    bitmap_set_bit (n_transform, idx);
+
 	  if (perm == 0)
 	    continue;
 
@@ -3764,6 +3911,46 @@  vect_optimize_slp (vec_info *vinfo)
     }
   while (changed || iteration == 1);
 
+  bitmap_clear (n_visited);
+
+  /* Transform and record any permutes for the usage of the instruction.
+     TODO: Check if it's cheaper to keep ADDSUB if available or use to
+	   COMPLEX_ADD based on the reverse permute that's needed.  */
+  sbitmap_iterator bi;
+  EXECUTE_IF_SET_IN_BITMAP (n_transform, 0, i, bi)
+    {
+      slp_tree node = vertices[i];
+
+      for (graph_edge *succ = slpg->vertices[i].succ;
+		       succ; succ = succ->succ_next)
+	{
+	  int perm_id = n_perm[succ->dest];
+	  if (perm_id <= 0)
+	    continue;
+
+	  load_permutation_t linperm = vect_reverse_perm (perms[perm_id]);
+	  perms[perm_id].release ();
+	  perms[perm_id] = linperm;
+
+          bitmap_set_bit (n_materialize, succ->dest);
+	}
+
+      /* Transform the node if we have determined that it's beneficial to do so.
+	 when we do so the perm that has been calculated for the children of the
+	 node will be applied.  */
+      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
+      gimple* stmt = STMT_VINFO_STMT (stmt_info);
+      internal_fn old_ifn = gimple_call_internal_fn (stmt);
+      internal_fn ifn = vect_get_transformed_pattern (old_ifn ,node);
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Tranforming SLP expression from %s to %s\n",
+			 internal_fn_name (old_ifn), internal_fn_name (ifn));
+      gimple_call_set_internal_fn (as_a <gcall *> (stmt), ifn);
+    }
+
+
   /* Materialize.  */
   for (i = 0; i < vertices.length (); ++i)
     {
@@ -3798,6 +3985,11 @@  vect_optimize_slp (vec_info *vinfo)
 			    SLP_TREE_SCALAR_OPS (child), true);
 	}
 
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "processing node %p\n",
+			 node);
+
       if (bitmap_bit_p (n_materialize, i))
 	{
 	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
@@ -3986,6 +4178,16 @@  vect_optimize_slp (vec_info *vinfo)
 	    }
 	}
     }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location, "Optimized SLP instances: \n");
+      hash_set<slp_tree> visited;
+      slp_instance instance;
+      FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+	vect_print_slp_graph (MSG_NOTE, vect_location,
+			      SLP_INSTANCE_TREE (instance), visited);
+    }
 }
 
 /* Gather loads reachable from the individual SLP graph entries.  */
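Finally, a hypothetical standalone sketch of the still-missing "push the
permute back into the load" step mentioned in the discussion; the name and
signature are mine, not from the patch:

/* Fold a single-input lane permute into its child's load permutation.
   The permute node picks lane lane_perm[i] of the child, and the child
   loads group element load_perm[j] into lane j, so the folded load node
   loads group element load_perm[lane_perm[i]] into lane i.  */
static void
fold_perm_into_load (const unsigned *lane_perm, const unsigned *load_perm,
                     unsigned *new_load_perm, unsigned n)
{
  for (unsigned i = 0; i < n; ++i)
    new_load_perm[i] = load_perm[lane_perm[i]];
}

With the seeding done in vect_build_slp_tree_2 above, the child's load
permutation starts out linear ({ 0, 1, ... }), so folding the split-out lane
permute back in simply recreates the load permutation the old code stored
directly in SLP_TREE_LOAD_PERMUTATION.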