
[1/2] middle-end Teach CSE to be able to do vector extracts.

Message ID patch-14773-tamar@arm.com
State New
Series [1/2] middle-end Teach CSE to be able to do vector extracts.

Commit Message

Tamar Christina Aug. 31, 2021, 1:29 p.m. UTC
Hi All,

This patch gets CSE to re-use constants already inside a vector rather than
re-materializing the constant again.

Basically consider the following case:

#include <stdint.h>
#include <arm_neon.h>

uint64_t
test (uint64_t a, uint64x2_t b, uint64x2_t* rt)
{
  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
  uint64_t res = a | arr[0];
  uint64x2_t val = vld1q_u64 (arr);
  *rt = vaddq_u64 (val, b);
  return res;
}

The actual behavior is inconsequential; however, notice that the same constant
is used in the vector (arr, and later val) and in the calculation of res.

The code we generate for this, however, is quite sub-optimal:

test:
        adrp    x2, .LC0
        sub     sp, sp, #16
        ldr     q1, [x2, #:lo12:.LC0]
        mov     x2, 16502
        movk    x2, 0x1023, lsl 16
        movk    x2, 0x4308, lsl 32
        add     v1.2d, v1.2d, v0.2d
        movk    x2, 0x942, lsl 48
        orr     x0, x0, x2
        str     q1, [x1]
        add     sp, sp, 16
        ret
.LC0:
        .xword  667169396713799798
        .xword  667169396713799798

Essentially we materialize the same constant twice.  The reason for this is
that the front-end lowers the constant extracted from arr[0] quite early on.
If you look at the result of FRE you'll find:

  <bb 2> :
  arr[0] = 667169396713799798;
  arr[1] = 667169396713799798;
  res_7 = a_6(D) | 667169396713799798;
  _16 = __builtin_aarch64_ld1v2di (&arr);
  _17 = VIEW_CONVERT_EXPR<uint64x2_t>(_16);
  _11 = b_10(D) + _17;
  *rt_12(D) = _11;
  arr ={v} {CLOBBER};
  return res_7;

Which makes sense for further optimization.  However, come expand time, if the
constant isn't representable on the target architecture it will be assigned to
a register again:

(insn 8 5 9 2 (set (reg:V2DI 99)
        (const_vector:V2DI [
                (const_int 667169396713799798 [0x942430810234076]) repeated x2
            ])) "cse.c":7:12 -1
     (nil))
...
(insn 14 13 15 2 (set (reg:DI 103)
        (const_int 667169396713799798 [0x942430810234076])) "cse.c":8:12 -1
     (nil))
(insn 15 14 16 2 (set (reg:DI 102 [ res ])
        (ior:DI (reg/v:DI 96 [ a ])
            (reg:DI 103))) "cse.c":8:12 -1
     (nil))

And since it's out of the immediate range of the scalar instruction used,
combine won't be able to do anything here.

This will then trigger the re-materialization of the constant twice.

To fix this, this patch extends CSE to be able to generate an extract of a
constant from another vector, or to build a vector from a constant by
duplicating another constant.

Whether this transformation is done depends entirely on the target's costing of
the different constants and operations.
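
For the example above this means CSE can record that element 0 of the vector
register equals the scalar constant and then offer either form to the target's
costing, roughly (a sketch using the register numbers from the RTL dump above,
not actual output of the pass):

  ;; extract the scalar from the already-materialized vector
  (set (reg:DI 103)
       (vec_select:DI (reg:V2DI 99)
           (parallel [(const_int 0)])))

  ;; or materialize the vector by duplicating the scalar
  (set (reg:V2DI 99)
       (vec_duplicate:V2DI (reg:DI 103)))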

I initially also investigated doing this in PRE, but PRE requires at least two
basic blocks to work and does not currently have any way to remove redundancies
within a single BB, and supporting that did not look easy.

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* cse.c (find_sets_in_insn): Register constants in sets.
	(cse_insn): Try materializing using vec_dup.

--- inline copy of patch -- 
diff --git a/gcc/cse.c b/gcc/cse.c
index 330c1e90ce05b8f95b58f24576ec93e10ec55d89..d76e01b6478e22e9dd5760b7c78cecb536d7daef 100644


--

Comments

Jeff Law Sept. 1, 2021, 11:48 p.m. UTC | #1
On 8/31/2021 7:29 AM, Tamar Christina wrote:
> Hi All,
>
> This patch gets CSE to re-use constants already inside a vector rather than
> re-materializing the constant again.
>
> Basically consider the following case:
>
> #include <stdint.h>
> #include <arm_neon.h>
>
> uint64_t
> test (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> {
>    uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
>    uint64_t res = a | arr[0];
>    uint64x2_t val = vld1q_u64 (arr);
>    *rt = vaddq_u64 (val, b);
>    return res;
> }
>
> The actual behavior is inconsequential; however, notice that the same constant
> is used in the vector (arr, and later val) and in the calculation of res.
>
> The code we generate for this, however, is quite sub-optimal:
>
> test:
>          adrp    x2, .LC0
>          sub     sp, sp, #16
>          ldr     q1, [x2, #:lo12:.LC0]
>          mov     x2, 16502
>          movk    x2, 0x1023, lsl 16
>          movk    x2, 0x4308, lsl 32
>          add     v1.2d, v1.2d, v0.2d
>          movk    x2, 0x942, lsl 48
>          orr     x0, x0, x2
>          str     q1, [x1]
>          add     sp, sp, 16
>          ret
> .LC0:
>          .xword  667169396713799798
>          .xword  667169396713799798
>
> Essentially we materialize the same constant twice.  The reason for this is
> that the front-end lowers the constant extracted from arr[0] quite early on.
> If you look at the result of FRE you'll find:
>
>    <bb 2> :
>    arr[0] = 667169396713799798;
>    arr[1] = 667169396713799798;
>    res_7 = a_6(D) | 667169396713799798;
>    _16 = __builtin_aarch64_ld1v2di (&arr);
>    _17 = VIEW_CONVERT_EXPR<uint64x2_t>(_16);
>    _11 = b_10(D) + _17;
>    *rt_12(D) = _11;
>    arr ={v} {CLOBBER};
>    return res_7;
>
> Which makes sense for further optimization.  However, come expand time, if the
> constant isn't representable on the target architecture it will be assigned to
> a register again:
>
> (insn 8 5 9 2 (set (reg:V2DI 99)
>          (const_vector:V2DI [
>                  (const_int 667169396713799798 [0x942430810234076]) repeated x2
>              ])) "cse.c":7:12 -1
>       (nil))
> ...
> (insn 14 13 15 2 (set (reg:DI 103)
>          (const_int 667169396713799798 [0x942430810234076])) "cse.c":8:12 -1
>       (nil))
> (insn 15 14 16 2 (set (reg:DI 102 [ res ])
>          (ior:DI (reg/v:DI 96 [ a ])
>              (reg:DI 103))) "cse.c":8:12 -1
>       (nil))
>
> And since it's out of the immediate range of the scalar instruction used,
> combine won't be able to do anything here.
>
> This will then trigger the re-materialization of the constant twice.
>
> To fix this, this patch extends CSE to be able to generate an extract of a
> constant from another vector, or to build a vector from a constant by
> duplicating another constant.
>
> Whether this transformation is done depends entirely on the target's costing of
> the different constants and operations.
>
> I initially also investigated doing this in PRE, but PRE requires at least two
> basic blocks to work and does not currently have any way to remove redundancies
> within a single BB, and supporting that did not look easy.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
> with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* cse.c (find_sets_in_insn): Register constants in sets.
> 	(cse_insn): Try materializing using vec_dup.
Looks good to me.

If you can turn that example into a test, even if it's just in the
aarch64 directory, that would be helpful.

Thanks,
Jeff
Richard Sandiford Sept. 3, 2021, 10:26 a.m. UTC | #2
Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> diff --git a/gcc/cse.c b/gcc/cse.c
> index 330c1e90ce05b8f95b58f24576ec93e10ec55d89..d76e01b6478e22e9dd5760b7c78cecb536d7daef 100644
> --- a/gcc/cse.c
> +++ b/gcc/cse.c
> @@ -44,6 +44,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "regs.h"
>  #include "function-abi.h"
>  #include "rtlanal.h"
> +#include "expr.h"
>  
>  /* The basic idea of common subexpression elimination is to go
>     through the code, keeping a record of expressions that would
> @@ -4274,6 +4275,25 @@ find_sets_in_insn (rtx_insn *insn, struct set **psets)
>  	 someplace else, so it isn't worth cse'ing.  */
>        else if (GET_CODE (SET_SRC (x)) == CALL)
>  	;
> +      else if (GET_CODE (SET_SRC (x)) == CONST_VECTOR
> +	       && GET_MODE_CLASS (GET_MODE (SET_SRC (x))) != MODE_VECTOR_BOOL)
> +	{
> +	  /* First register the vector itself.  */
> +	  sets[n_sets++].rtl = x;
> +	  rtx src = SET_SRC (x);
> +	  machine_mode elem_mode = GET_MODE_INNER (GET_MODE (src));
> +	  /* Go over the constants of the CONST_VECTOR in forward order, to
> +	     put them in the same order in the SETS array.  */
> +	  for (unsigned i = 0; i < const_vector_encoded_nelts (src) ; i++)
> +	    {
> +	      /* These are templates and don't actually get emitted but are
> +		 used to tell CSE how to get to a particular constant.  */
> +	      rtx tmp = gen_rtx_PARALLEL (VOIDmode,
> +					  gen_rtvec (1, GEN_INT (i)));
> +	      rtx y = gen_rtx_VEC_SELECT (elem_mode, SET_DEST (x), tmp);
> +	      sets[n_sets++].rtl = gen_rtx_SET (y, CONST_VECTOR_ELT (src, i));
> +	    }
> +	}

As mentioned in the 2/2 thread, I think we should use subregs for
the case where they're canonical.  It'd probably be worth adding a
simplify-rtx.c helper to extract one element from a vector, e.g.:

  rtx simplify_gen_vec_select (rtx op, unsigned int index);

so that this is easier to do.
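
Just as a sketch of the idea rather than the final interface (in particular the
lowpart/endianness handling would need checking), something like:

  rtx
  simplify_context::simplify_gen_vec_select (rtx op, unsigned int index)
  {
    machine_mode imode = GET_MODE_INNER (GET_MODE (op));

    /* Sketch only: use the canonical subreg form when the selected
       element is the lowpart of OP.  */
    if (known_eq (index * GET_MODE_UNIT_SIZE (GET_MODE (op)),
                  subreg_lowpart_offset (imode, GET_MODE (op))))
      return simplify_gen_subreg (imode, op, GET_MODE (op),
                                  subreg_lowpart_offset (imode, GET_MODE (op)));

    rtx sel = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (index)));
    return gen_rtx_VEC_SELECT (imode, op, sel);
  }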

Does making the loop above per-element mean that, for 128-bit Advanced
SIMD, the optimisation “only” kicks in for 64-bit element sizes?
Perhaps for other element sizes we could do “top” and “bottom” halves.
(There's obviously no need to do that as part of this work, was just
wondering.)

>        else
>  	sets[n_sets++].rtl = x;
>      }
> @@ -4513,7 +4533,14 @@ cse_insn (rtx_insn *insn)
>    struct set *sets = (struct set *) 0;
>  
>    if (GET_CODE (x) == SET)
> -    sets = XALLOCA (struct set);
> +    {
> +      /* For CONST_VECTOR we want to be able to CSE the vector itself along with
> +	 elements inside the vector if the target says it's cheap.  */
> +      if (GET_CODE (SET_SRC (x)) == CONST_VECTOR)
> +	sets = XALLOCAVEC (struct set, const_vector_encoded_nelts (SET_SRC (x)) + 1);
> +      else
> +	sets = XALLOCA (struct set);
> +    }
>    else if (GET_CODE (x) == PARALLEL)
>      sets = XALLOCAVEC (struct set, XVECLEN (x, 0));

I think this would be easier if “sets” was first converted to an
auto_vec, say auto_vec<struct set, 8>.  We then wouldn't need to
predict in advance how many elements are needed.
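
I.e. something like this (sketch only):

  auto_vec<struct set, 8> sets;
  ...
  /* Each case then just pushes the entries it needs, rather than
     indexing into a pre-sized array.  */
  struct set entry = {};
  entry.rtl = x;
  sets.safe_push (entry);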

> @@ -4997,6 +5024,26 @@ cse_insn (rtx_insn *insn)
>  	  src_related_is_const_anchor = src_related != NULL_RTX;
>  	}
>  
> +      /* Try to re-materialize a vec_dup with an existing constant.   */
> +      if (GET_CODE (src) == CONST_VECTOR
> +	  && const_vector_encoded_nelts (src) == 1)
> +	{
> +	   rtx const_rtx = CONST_VECTOR_ELT (src, 0);

Would be simpler as:

  rtx src_elt;
  if (const_vec_duplicate_p (src, &src_elt))

I think we should also check !src_eqv_here, or perhaps:

  (!src_eqv_here || CONSTANT_P (src_eqv_here))

so that we don't override any existing reg notes, which could have more
chance of succeeding.
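
Putting those two together, roughly:

  rtx src_elt;
  if ((!src_eqv_here || CONSTANT_P (src_eqv_here))
      && const_vec_duplicate_p (src, &src_elt))
    {
      /* ...look up SRC_ELT and build the VEC_DUPLICATE as below...  */
    }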

> +	   machine_mode const_mode = GET_MODE_INNER (GET_MODE (src));
> +	   struct table_elt *related_elt
> +		= lookup (const_rtx, HASH (const_rtx, const_mode), const_mode);
> +	   if (related_elt)
> +	    {
> +	      for (related_elt = related_elt->first_same_value;
> +		   related_elt; related_elt = related_elt->next_same_value)
> +		if (REG_P (related_elt->exp))
> +		  {
> +		    src_eqv_here
> +			= gen_rtx_VEC_DUPLICATE (GET_MODE (src),
> +						 related_elt->exp);
> +		  }

Other similar loops seem to break after the first match, instead of
picking the last match.

Thanks,
Richard

> +	    }
> +	}
>  
>        if (src == src_folded)
>  	src_folded = 0;
Tamar Christina Sept. 8, 2021, 12:34 p.m. UTC | #3
Hi Jeff & Richard,

> If you can turn that example into a test, even if it's just in the
> aarch64 directory, that would be helpful

The second patch 2/2 has various tests for this as the cost model had to
be made more accurate for it to work.
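
For reference, the example from the patch description can be turned into a test
along these lines (the scan pattern here is only illustrative; the real tests
are in the 2/2 patch):

  /* { dg-do compile } */
  /* { dg-options "-O2" } */

  #include <arm_neon.h>

  uint64_t
  test (uint64_t a, uint64x2_t b, uint64x2_t *rt)
  {
    uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL };
    uint64_t res = a | arr[0];
    uint64x2_t val = vld1q_u64 (arr);
    *rt = vaddq_u64 (val, b);
    return res;
  }

  /* The scalar constant should now be re-used from the vector, so the
     mov/movk materialization sequence should not be needed.  */
  /* { dg-final { scan-assembler-not {movk} } } */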

> 
> As mentioned in the 2/2 thread, I think we should use subregs for
> the case where they're canonical.  It'd probably be worth adding a
> simplify-rtx.c helper to extract one element from a vector, e.g.:
> 
>   rtx simplify_gen_vec_select (rtx op, unsigned int index);
> 
> so that this is easier to do.
> 
> Does making the loop above per-element mean that, for 128-bit Advanced
> SIMD, the optimisation “only” kicks in for 64-bit element sizes?
> Perhaps for other element sizes we could do “top” and “bottom” halves.
> (There's obviously no need to do that as part of this work, was just
> wondering.)
> 

It should handle extraction of any element size, so it's able to use a value
in an arbitrary location.  CSE already handles low/high re-use optimally.  For example:

#include <arm_neon.h>

extern int16x8_t bar (int16x8_t, int16x8_t);

int16x8_t foo ()
{
    int16_t s[4] = {1,2,3,4};
    int16_t d[8] = {1,2,3,4,5,6,7,8};

    int16x4_t r1 = vld1_s16 (s);
    int16x8_t r2 = vcombine_s16 (r1, r1);
    int16x8_t r3 = vld1q_s16 (d);
    return bar (r2, r3);
}

but our cost model is currently blocking it because we never costed vec_consts.
Without the 2/2 patch we generate:

foo:
        stp     x29, x30, [sp, -48]!
        adrp    x0, .LC0
        mov     x29, sp
        ldr     q1, [x0, #:lo12:.LC0]
        adrp    x0, .LC1
        ldr     q0, [x0, #:lo12:.LC1]
        adrp    x0, .LC2
        str     q1, [sp, 32]
        ldr     d2, [x0, #:lo12:.LC2]
        str     d2, [sp, 24]
        bl      bar
        ldp     x29, x30, [sp], 48
        ret
.LC0:
        .hword  1
        .hword  2
        .hword  3
        .hword  4
        .hword  5
        .hword  6
        .hword  7
        .hword  8
.LC1:
        .hword  1
        .hword  2
        .hword  3
        .hword  4
        .hword  1
        .hword  2
        .hword  3
        .hword  4

but with the 2/2 patch:

foo:
        stp     x29, x30, [sp, -48]!
        adrp    x0, .LC0
        mov     x29, sp
        ldr     d2, [x0, #:lo12:.LC0]
        adrp    x0, .LC1
        ldr     q1, [x0, #:lo12:.LC1]
        str     d2, [sp, 24]
        dup     d0, v2.d[0]
        str     q1, [sp, 32]
        ins     v0.d[1], v2.d[0]
        bl      bar
        ldp     x29, x30, [sp], 48
        ret
.LC1:
        .hword  1
        .hword  2
        .hword  3
        .hword  4
        .hword  5
        .hword  6
        .hword  7
        .hword  8

It's not entirely optimal of course, but it is a step forward.  I think when we
fix the vld's this should then become optimal, as currently the MEMs are causing
it not to re-use those values.

> >        else
> >  	sets[n_sets++].rtl = x;
> >      }
> > @@ -4513,7 +4533,14 @@ cse_insn (rtx_insn *insn)
> >    struct set *sets = (struct set *) 0;
> >  
> >    if (GET_CODE (x) == SET)
> > -    sets = XALLOCA (struct set);
> > +    {
> > +      /* For CONST_VECTOR we want to be able to CSE the vector itself along with
> > +	 elements inside the vector if the target says it's cheap.  */
> > +      if (GET_CODE (SET_SRC (x)) == CONST_VECTOR)
> > +	sets = XALLOCAVEC (struct set, const_vector_encoded_nelts (SET_SRC (x)) + 1);
> > +      else
> > +	sets = XALLOCA (struct set);
> > +    }
> >    else if (GET_CODE (x) == PARALLEL)
> >      sets = XALLOCAVEC (struct set, XVECLEN (x, 0));
> 
> I think this would be easier if “sets” was first converted to an
> auto_vec, say auto_vec<struct set, 8>.  We then wouldn't need to
> predict in advance how many elements are needed.
> 

Done.

> > @@ -4997,6 +5024,26 @@ cse_insn (rtx_insn *insn)
> >  	  src_related_is_const_anchor = src_related != NULL_RTX;
> >  	}
> >  
> > +      /* Try to re-materialize a vec_dup with an existing constant.   */
> > +      if (GET_CODE (src) == CONST_VECTOR
> > +	  && const_vector_encoded_nelts (src) == 1)
> > +	{
> > +	   rtx const_rtx = CONST_VECTOR_ELT (src, 0);
> 
> Would be simpler as:
> 
>   rtx src_elt;
>   if (const_vec_duplicate_p (src, &src_elt))
> 
> I think we should also check !src_eqv_here, or perhaps:
> 
>   (!src_eqv_here || CONSTANT_P (src_eqv_here))
> 
> so that we don't override any existing reg notes, which could have more
> chance of succeeding.
> 

Done.

> > +	   machine_mode const_mode = GET_MODE_INNER (GET_MODE (src));
> > +	   struct table_elt *related_elt
> > +		= lookup (const_rtx, HASH (const_rtx, const_mode), const_mode);
> > +	   if (related_elt)
> > +	    {
> > +	      for (related_elt = related_elt->first_same_value;
> > +		   related_elt; related_elt = related_elt->next_same_value)
> > +		if (REG_P (related_elt->exp))
> > +		  {
> > +		    src_eqv_here
> > +			= gen_rtx_VEC_DUPLICATE (GET_MODE (src),
> > +						 related_elt->exp);
> > +		  }
> 
> Other similar loops seem to break after the first match, instead of
> picking the last match.
> 

Done.

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* cse.c (add_to_set): New.
	(find_sets_in_insn): Register constants in sets.
	(canonicalize_insn): Use auto_vec instead.
	(cse_insn): Try materializing using vec_dup.
	* rtl.h (simplify_context::simplify_gen_vec_select,
	simplify_gen_vec_select): New.
	* simplify-rtx.c (simplify_context::simplify_gen_vec_select): New.

> Thanks,
> Richard
> 
> > +	    }
> > +	}
> >  
> >        if (src == src_folded)
> >  	src_folded = 0;

--

Patch

diff --git a/gcc/cse.c b/gcc/cse.c
index 330c1e90ce05b8f95b58f24576ec93e10ec55d89..d76e01b6478e22e9dd5760b7c78cecb536d7daef 100644
--- a/gcc/cse.c
+++ b/gcc/cse.c
@@ -44,6 +44,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "regs.h"
 #include "function-abi.h"
 #include "rtlanal.h"
+#include "expr.h"
 
 /* The basic idea of common subexpression elimination is to go
    through the code, keeping a record of expressions that would
@@ -4274,6 +4275,25 @@  find_sets_in_insn (rtx_insn *insn, struct set **psets)
 	 someplace else, so it isn't worth cse'ing.  */
       else if (GET_CODE (SET_SRC (x)) == CALL)
 	;
+      else if (GET_CODE (SET_SRC (x)) == CONST_VECTOR
+	       && GET_MODE_CLASS (GET_MODE (SET_SRC (x))) != MODE_VECTOR_BOOL)
+	{
+	  /* First register the vector itself.  */
+	  sets[n_sets++].rtl = x;
+	  rtx src = SET_SRC (x);
+	  machine_mode elem_mode = GET_MODE_INNER (GET_MODE (src));
+	  /* Go over the constants of the CONST_VECTOR in forward order, to
+	     put them in the same order in the SETS array.  */
+	  for (unsigned i = 0; i < const_vector_encoded_nelts (src) ; i++)
+	    {
+	      /* These are templates and don't actually get emitted but are
+		 used to tell CSE how to get to a particular constant.  */
+	      rtx tmp = gen_rtx_PARALLEL (VOIDmode,
+					  gen_rtvec (1, GEN_INT (i)));
+	      rtx y = gen_rtx_VEC_SELECT (elem_mode, SET_DEST (x), tmp);
+	      sets[n_sets++].rtl = gen_rtx_SET (y, CONST_VECTOR_ELT (src, i));
+	    }
+	}
       else
 	sets[n_sets++].rtl = x;
     }
@@ -4513,7 +4533,14 @@  cse_insn (rtx_insn *insn)
   struct set *sets = (struct set *) 0;
 
   if (GET_CODE (x) == SET)
-    sets = XALLOCA (struct set);
+    {
+      /* For CONST_VECTOR we want to be able to CSE the vector itself along with
+	 elements inside the vector if the target says it's cheap.  */
+      if (GET_CODE (SET_SRC (x)) == CONST_VECTOR)
+	sets = XALLOCAVEC (struct set, const_vector_encoded_nelts (SET_SRC (x)) + 1);
+      else
+	sets = XALLOCA (struct set);
+    }
   else if (GET_CODE (x) == PARALLEL)
     sets = XALLOCAVEC (struct set, XVECLEN (x, 0));
 
@@ -4997,6 +5024,26 @@  cse_insn (rtx_insn *insn)
 	  src_related_is_const_anchor = src_related != NULL_RTX;
 	}
 
+      /* Try to re-materialize a vec_dup with an existing constant.   */
+      if (GET_CODE (src) == CONST_VECTOR
+	  && const_vector_encoded_nelts (src) == 1)
+	{
+	   rtx const_rtx = CONST_VECTOR_ELT (src, 0);
+	   machine_mode const_mode = GET_MODE_INNER (GET_MODE (src));
+	   struct table_elt *related_elt
+		= lookup (const_rtx, HASH (const_rtx, const_mode), const_mode);
+	   if (related_elt)
+	    {
+	      for (related_elt = related_elt->first_same_value;
+		   related_elt; related_elt = related_elt->next_same_value)
+		if (REG_P (related_elt->exp))
+		  {
+		    src_eqv_here
+			= gen_rtx_VEC_DUPLICATE (GET_MODE (src),
+						 related_elt->exp);
+		  }
+	    }
+	}
 
       if (src == src_folded)
 	src_folded = 0;