diff mbox series

[8/9] ifcvt: Handle swap-style idioms differently.

Message ID 20190802151028.15590-9-rdapp@linux.ibm.com
State New
Headers show
Series Improve icvt "convert multiple" | expand

Commit Message

Robin Dapp Aug. 2, 2019, 3:10 p.m. UTC
A swap-style idiom like
 tmp = a
   a = b
   b = tmp
would be transformed like
 tmp_tmp = cond ? a : tmp
 tmp_a   = cond ? b : a
 tmp_b   = cond ? tmp_tmp : b
 [...]
including rewiring the first source operand to previous writes (e.g. tmp ->
tmp_tmp).

The code would recognize this, though, and change the last line to
 tmp_b   = cond ? a : b.

Without additional temporaries we can now emit the following sequence:
 tmp = a	     // (no condition here)
 a = cond ? b : a
 b = cond ? tmp : b
avoiding any rewiring which would break things now.

check_need_cmovs () finds swap-style idioms and marks the first of the
three instructions as not needing a cmove.
---
 gcc/ifcvt.c | 97 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 78 insertions(+), 19 deletions(-)

Comments

Richard Sandiford Aug. 6, 2019, 8:36 p.m. UTC | #1
Robin Dapp <rdapp@linux.ibm.com> writes:
> A swap-style idiom like
>  tmp = a
>    a = b
>    b = tmp
> would be transformed like
>  tmp_tmp = cond ? a : tmp
>  tmp_a   = cond ? b : a
>  tmp_b   = cond ? tmp_tmp : b
>  [...]
> including rewiring the first source operand to previous writes (e.g. tmp ->
> tmp_tmp).
>
> The code would recognize this, though, and change the last line to
>  tmp_b   = cond ? a : b.
>
> Without additional temporaries we can now emit the following sequence:
>  tmp = a	     // (no condition here)
>  a = cond ? b : a
>  b = cond ? tmp : b
> avoiding any rewiring which would break things now.
>
> check_need_cmovs () finds swap-style idioms and marks the first of the
> three instructions as not needing a cmove.

Looks like a nice optimisation, but could we just test whether the
destination of a set isn't live on exit from the then block?  I think
we could do that on the fly during the main noce_convert_multiple_sets
loop.

Thanks,
Richard

> ---
>  gcc/ifcvt.c | 97 ++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 78 insertions(+), 19 deletions(-)
>
> diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
> index 955f9541f60..09bf443656c 100644
> --- a/gcc/ifcvt.c
> +++ b/gcc/ifcvt.c
> @@ -103,6 +103,7 @@ static rtx_insn *block_has_only_trap (basic_block);
>  static void check_need_temps (basic_block bb,
>                                hash_map<rtx, bool> *need_temp,
>                                rtx cond);
> +static void check_need_cmovs (basic_block bb, hash_map<rtx, bool> *need_cmov);
>  
>  
>  /* Count the number of non-jump active insns in BB.  */
> @@ -3207,6 +3208,7 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
>    int count = 0;
>  
>    hash_map<rtx, bool> need_temps;
> +  hash_map<rtx, bool> need_no_cmovs;
>  
>    /* If we newly set a CC before a cmov, we might need a temporary
>       even though the compare will be removed by a later pass.  Costing
> @@ -3214,6 +3216,9 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
>  
>    check_need_temps (then_bb, &need_temps, cond);
>  
> +  /* Identify swap-style idioms.  */
> +  check_need_cmovs (then_bb, &need_no_cmovs);
> +
>    hash_map<rtx, bool> temps_created;
>  
>    FOR_BB_INSNS (then_bb, insn)
> @@ -3229,10 +3234,8 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
>        rtx new_val = SET_SRC (set);
>        rtx old_val = target;
>  
> -      rtx dest = SET_DEST (set);
> -
>        rtx temp;
> -      if (need_temps.get (dest))
> +      if (need_temps.get (target))
>         {
>           temp = gen_reg_rtx (GET_MODE (target));
>           temps_created.put (target, true);
> @@ -3241,18 +3244,11 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
>         temp = target;
>  
>        /* If we were supposed to read from an earlier write in this block,
> -	 we've changed the register allocation.  Rewire the read.  While
> -	 we are looking, also try to catch a swap idiom.  */
> +	 we've changed the register allocation.  Rewire the read.   */
>        for (int i = count - 1; i >= 0; --i)
>  	if (reg_overlap_mentioned_p (new_val, targets[i]))
>  	  {
> -	    /* Catch a "swap" style idiom.  */
> -	    if (find_reg_note (insn, REG_DEAD, new_val) != NULL_RTX)
> -	      /* The write to targets[i] is only live until the read
> -		 here.  As the condition codes match, we can propagate
> -		 the set to here.  */
> -	      new_val = SET_SRC (single_set (unmodified_insns[i]));
> -	    else
> +	    if (find_reg_note (insn, REG_DEAD, new_val) == NULL_RTX)
>  	      new_val = temporaries[i];
>  	    break;
>  	  }
> @@ -3324,8 +3320,11 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
>  
>        {
>  	start_sequence ();
> -	temp_dest1 = noce_emit_cmove (if_info, temp, cond_code,
> -	    x, y, new_val, old_val, NULL_RTX);
> +	if (!need_no_cmovs.get (insn))
> +	  temp_dest1 = noce_emit_cmove (if_info, temp, cond_code,
> +	      x, y, new_val, old_val, NULL_RTX);
> +	else
> +	  noce_emit_move_insn (target, new_val);
>  
>  	/* If we failed to expand the conditional move, drop out and don't
>  	   try to continue.  */
> @@ -3346,13 +3345,16 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
>  	end_sequence ();
>        }
>  
> -	/* Now try emitting one passing a non-canonicalized cc comparison.
> -	   This allows the backend to emit a cmov directly without additional
> +	/* Now try emitting a cmov passing a non-canonicalized cc comparison.
> +	   This allows backends to emit a cmov directly without additional
>  	   compare.  */
>        {
>  	start_sequence ();
> -	temp_dest2 = noce_emit_cmove (if_info, target, cond_code,
> -	    x, y, new_val, old_val, cc_cmp);
> +	if (!need_no_cmovs.get (insn))
> +	  temp_dest2 = noce_emit_cmove (if_info, target, cond_code,
> +	      x, y, new_val, old_val, cc_cmp);
> +	else
> +	  noce_emit_move_insn (target, new_val);
>  
>  	/* If we failed to expand the conditional move, drop out and don't
>  	   try to continue.  */
> @@ -3931,6 +3933,7 @@ check_cond_move_block (basic_block bb,
>  
>  /* Check for which sets we need to emit temporaries to hold the destination of
>     a conditional move.  */
> +
>  static void
>  check_need_temps (basic_block bb, hash_map<rtx, bool> *need_temp, rtx cond)
>  {
> @@ -3938,7 +3941,7 @@ check_need_temps (basic_block bb, hash_map<rtx, bool> *need_temp, rtx cond)
>  
>    FOR_BB_INSNS (bb, insn)
>      {
> -      rtx set, dest;
> +      rtx set, src, dest;
>  
>        if (!active_insn_p (insn))
>  	continue;
> @@ -3947,12 +3950,68 @@ check_need_temps (basic_block bb, hash_map<rtx, bool> *need_temp, rtx cond)
>        if (set == NULL_RTX)
>  	continue;
>  
> +      src = SET_SRC (set);
>        dest = SET_DEST (set);
>  
>        /* noce_emit_cmove will emit the condition check every time it is called
>           so we need a temp register if the destination is modified.  */
>        if (reg_overlap_mentioned_p (dest, cond))
>  	need_temp->put (dest, true);
> +
> +    }
> +}
> +
> +/* Find local swap-style idioms in BB and mark the first insn (1)
> +   that is only a temporary as not needing a conditional move as
> +   it is going to be dead afterwards anyway.
> +
> +     (1) int tmp = a;
> +	 a = b;
> +	 b = tmp;
> +
> +
> +        ifcvt
> +	 -->
> +
> +	 load tmp,a
> +	 cmov a,b
> +	 cmov b,tmp */
> +
> +static void
> +check_need_cmovs (basic_block bb, hash_map<rtx, bool> *need_cmov)
> +{
> +  rtx_insn *insn;
> +  int count = 0;
> +  auto_vec<rtx> insns;
> +  auto_vec<rtx> dests;
> +
> +  FOR_BB_INSNS (bb, insn)
> +    {
> +      rtx set, src, dest;
> +
> +      if (!active_insn_p (insn))
> +	continue;
> +
> +      set = single_set (insn);
> +      if (set == NULL_RTX)
> +	continue;
> +
> +      src = SET_SRC (set);
> +      dest = SET_DEST (set);
> +
> +      for (int i = count - 1; i >= 0; --i)
> +	{
> +	  if (reg_overlap_mentioned_p (src, dests[i])
> +	      && find_reg_note (insn, REG_DEAD, src) != NULL_RTX)
> +	    {
> +	      need_cmov->put (insns[i], false);
> +	    }
> +	}
> +
> +      insns.safe_push (insn);
> +      dests.safe_push (dest);
> +
> +      count++;
>      }
>  }
Robin Dapp Aug. 16, 2019, 12:52 p.m. UTC | #2
> Looks like a nice optimisation, but could we just test whether the
> destination of a set isn't live on exit from the then block?  I think
> we could do that on the fly during the main noce_convert_multiple_sets
> loop.

I included this locally along with the rest of the remarks. Any comments
on the rest/bulk of the patch (the part trying to compare costs of both
cmovs "types")?

Regards
 Robin
Richard Sandiford Aug. 16, 2019, 3:25 p.m. UTC | #3
Robin Dapp <rdapp@linux.ibm.com> writes:
>> Looks like a nice optimisation, but could we just test whether the
>> destination of a set isn't live on exit from the then block?  I think
>> we could do that on the fly during the main noce_convert_multiple_sets
>> loop.
>
> I included this locally along with the rest of the remarks. Any comments
> on the rest/bulk of the patch (the part trying to compare costs of both
> cmovs "types")?

I'm still a bit worried about the overlap between the expanded
noce_convert_multiple_sets and cond_move_process_if_block (5/9).
It seems like we're making noce_convert_multiple_set handle most of
the conditional move cases that cond_move_process_if_block can handle.
But like you say, noce_convert_multiple_set is still restricted to
half-diamonds, whereas cond_move_process_if_block can handle full
diamonds.

3/9, 4/9 and 8/9 seems like good independent improvements.
But for 5/9, 6/9 and 7/9, it looks to me like it would be better
to improve cond_move_process_if_block so that it handles all the
conditional move cases you're targetting.

Definitely welcome other opinions though. :-)

Thanks,
Richard
Robin Dapp Aug. 17, 2019, 11:44 a.m. UTC | #4
> I'm still a bit worried about the overlap between the expanded
> noce_convert_multiple_sets and cond_move_process_if_block (5/9).
> It seems like we're making noce_convert_multiple_set handle most of
> the conditional move cases that cond_move_process_if_block can handle.
> But like you say, noce_convert_multiple_set is still restricted to
> half-diamonds, whereas cond_move_process_if_block can handle full
> diamonds.

I see your concern and would also agree on that there should be a
clearer demarcation of which method can handle what.  I'm not really
sure I fully understand the distinction between "noce" and "cond_move"
anyway, when we emit conditional moves in the "noce" parts.  As said
before, in my opinion, both noce_convert_multiple and cond_move_process
should be unified rather sooner than later.

Yet, before and after my suggested patch, noce_convert_multiple_set can
handle blocks that cond_move_process cannot, e.g. cmovs that write to
one of the registers the condition checked (like min/max idioms) or even
cmovs that touch registers that are modified earlier in the block (like
in swap idioms).  As far as I understand, this is why the additional
temporaries have been introduced in the first place - my patch now tries
to limit these temporaries to situations where we really need them.

I would argue that there is still sufficient difference between them,
even though both can now handle constants.

Regards
 Robin
diff mbox series

Patch

diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 955f9541f60..09bf443656c 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -103,6 +103,7 @@  static rtx_insn *block_has_only_trap (basic_block);
 static void check_need_temps (basic_block bb,
                               hash_map<rtx, bool> *need_temp,
                               rtx cond);
+static void check_need_cmovs (basic_block bb, hash_map<rtx, bool> *need_cmov);
 
 
 /* Count the number of non-jump active insns in BB.  */
@@ -3207,6 +3208,7 @@  noce_convert_multiple_sets (struct noce_if_info *if_info)
   int count = 0;
 
   hash_map<rtx, bool> need_temps;
+  hash_map<rtx, bool> need_no_cmovs;
 
   /* If we newly set a CC before a cmov, we might need a temporary
      even though the compare will be removed by a later pass.  Costing
@@ -3214,6 +3216,9 @@  noce_convert_multiple_sets (struct noce_if_info *if_info)
 
   check_need_temps (then_bb, &need_temps, cond);
 
+  /* Identify swap-style idioms.  */
+  check_need_cmovs (then_bb, &need_no_cmovs);
+
   hash_map<rtx, bool> temps_created;
 
   FOR_BB_INSNS (then_bb, insn)
@@ -3229,10 +3234,8 @@  noce_convert_multiple_sets (struct noce_if_info *if_info)
       rtx new_val = SET_SRC (set);
       rtx old_val = target;
 
-      rtx dest = SET_DEST (set);
-
       rtx temp;
-      if (need_temps.get (dest))
+      if (need_temps.get (target))
        {
          temp = gen_reg_rtx (GET_MODE (target));
          temps_created.put (target, true);
@@ -3241,18 +3244,11 @@  noce_convert_multiple_sets (struct noce_if_info *if_info)
        temp = target;
 
       /* If we were supposed to read from an earlier write in this block,
-	 we've changed the register allocation.  Rewire the read.  While
-	 we are looking, also try to catch a swap idiom.  */
+	 we've changed the register allocation.  Rewire the read.   */
       for (int i = count - 1; i >= 0; --i)
 	if (reg_overlap_mentioned_p (new_val, targets[i]))
 	  {
-	    /* Catch a "swap" style idiom.  */
-	    if (find_reg_note (insn, REG_DEAD, new_val) != NULL_RTX)
-	      /* The write to targets[i] is only live until the read
-		 here.  As the condition codes match, we can propagate
-		 the set to here.  */
-	      new_val = SET_SRC (single_set (unmodified_insns[i]));
-	    else
+	    if (find_reg_note (insn, REG_DEAD, new_val) == NULL_RTX)
 	      new_val = temporaries[i];
 	    break;
 	  }
@@ -3324,8 +3320,11 @@  noce_convert_multiple_sets (struct noce_if_info *if_info)
 
       {
 	start_sequence ();
-	temp_dest1 = noce_emit_cmove (if_info, temp, cond_code,
-	    x, y, new_val, old_val, NULL_RTX);
+	if (!need_no_cmovs.get (insn))
+	  temp_dest1 = noce_emit_cmove (if_info, temp, cond_code,
+	      x, y, new_val, old_val, NULL_RTX);
+	else
+	  noce_emit_move_insn (target, new_val);
 
 	/* If we failed to expand the conditional move, drop out and don't
 	   try to continue.  */
@@ -3346,13 +3345,16 @@  noce_convert_multiple_sets (struct noce_if_info *if_info)
 	end_sequence ();
       }
 
-	/* Now try emitting one passing a non-canonicalized cc comparison.
-	   This allows the backend to emit a cmov directly without additional
+	/* Now try emitting a cmov passing a non-canonicalized cc comparison.
+	   This allows backends to emit a cmov directly without additional
 	   compare.  */
       {
 	start_sequence ();
-	temp_dest2 = noce_emit_cmove (if_info, target, cond_code,
-	    x, y, new_val, old_val, cc_cmp);
+	if (!need_no_cmovs.get (insn))
+	  temp_dest2 = noce_emit_cmove (if_info, target, cond_code,
+	      x, y, new_val, old_val, cc_cmp);
+	else
+	  noce_emit_move_insn (target, new_val);
 
 	/* If we failed to expand the conditional move, drop out and don't
 	   try to continue.  */
@@ -3931,6 +3933,7 @@  check_cond_move_block (basic_block bb,
 
 /* Check for which sets we need to emit temporaries to hold the destination of
    a conditional move.  */
+
 static void
 check_need_temps (basic_block bb, hash_map<rtx, bool> *need_temp, rtx cond)
 {
@@ -3938,7 +3941,7 @@  check_need_temps (basic_block bb, hash_map<rtx, bool> *need_temp, rtx cond)
 
   FOR_BB_INSNS (bb, insn)
     {
-      rtx set, dest;
+      rtx set, src, dest;
 
       if (!active_insn_p (insn))
 	continue;
@@ -3947,12 +3950,68 @@  check_need_temps (basic_block bb, hash_map<rtx, bool> *need_temp, rtx cond)
       if (set == NULL_RTX)
 	continue;
 
+      src = SET_SRC (set);
       dest = SET_DEST (set);
 
       /* noce_emit_cmove will emit the condition check every time it is called
          so we need a temp register if the destination is modified.  */
       if (reg_overlap_mentioned_p (dest, cond))
 	need_temp->put (dest, true);
+
+    }
+}
+
+/* Find local swap-style idioms in BB and mark the first insn (1)
+   that is only a temporary as not needing a conditional move as
+   it is going to be dead afterwards anyway.
+
+     (1) int tmp = a;
+	 a = b;
+	 b = tmp;
+
+
+        ifcvt
+	 -->
+
+	 load tmp,a
+	 cmov a,b
+	 cmov b,tmp */
+
+static void
+check_need_cmovs (basic_block bb, hash_map<rtx, bool> *need_cmov)
+{
+  rtx_insn *insn;
+  int count = 0;
+  auto_vec<rtx> insns;
+  auto_vec<rtx> dests;
+
+  FOR_BB_INSNS (bb, insn)
+    {
+      rtx set, src, dest;
+
+      if (!active_insn_p (insn))
+	continue;
+
+      set = single_set (insn);
+      if (set == NULL_RTX)
+	continue;
+
+      src = SET_SRC (set);
+      dest = SET_DEST (set);
+
+      for (int i = count - 1; i >= 0; --i)
+	{
+	  if (reg_overlap_mentioned_p (src, dests[i])
+	      && find_reg_note (insn, REG_DEAD, src) != NULL_RTX)
+	    {
+	      need_cmov->put (insns[i], false);
+	    }
+	}
+
+      insns.safe_push (insn);
+      dests.safe_push (dest);
+
+      count++;
     }
 }