[v5] dse: Remove partial load after full store for high part access[PR71309]

Message ID 1f966acd-605a-ef60-4084-8a3bd03fd90e@linux.ibm.com
State New
Series [v5] dse: Remove partial load after full store for high part access[PR71309]

Commit Message

Xionghu Luo Aug. 3, 2020, 6:58 a.m. UTC
Thanks, v5 is updated per the comments:
 1. Move the const_rhs shift out of the loop;
 2. Start the mode iteration from the integer size of read_mode.


This patch optimizes the following (it works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

The final asm is as below, with no partial load after the full store (stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It achieves a ~25% performance improvement for typical cases on
Power9.  Bootstrapped and regression tested on Power9-LE.

For AArch64, one ldr is replaced by a mov with this patch:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

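At the C level this is a full-struct store followed by a load of one of
its fields; a minimal sketch (essentially the new testcase below,
gcc.target/powerpc/pr71309.c) is:

struct path {
    void *mnt;
    void *dentry;
};

struct nameidata {
    struct path path;
    struct path root;
};

void *foo (struct nameidata *nd)
{
  nd->path = nd->root;     /* full 16-byte store  */
  return nd->path.dentry;  /* load of the high 8 bytes, now a subreg  */
}
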
gcc/ChangeLog:

2020-08-03  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-08-03  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 78 ++++++++++++++--------
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 +++++++++
 2 files changed, 82 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

Comments

Richard Sandiford Aug. 3, 2020, 2:01 p.m. UTC | #1
luoxhu <luoxhu@linux.ibm.com> writes:
> [...]
>        /* Try a wider mode if truncating the store mode to NEW_MODE
>  	 requires a real instruction.  */
>        if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>  	  && !targetm.modes_tieable_p (new_mode, store_mode))
>  	continue;
>  
> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)

Like I mentioned before, this should be the other way around:

    multiple_p (shift, GET_MODE_BITSIZE (new_mode))

i.e. for the subreg to be valid, the shift amount must be a multiple
of the outer mode size in bits.
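
For example, in the motivating case above store_mode is TImode,
read_mode is DImode and shift is 64, so with new_mode == DImode

    multiple_p (shift, GET_MODE_BITSIZE (new_mode))   /* 64 % 64 == 0 */

holds, and the high part can be taken directly as
(subreg:DI (reg:TI 119) 8) (r119:TI#8 in the dump above), whereas
e.g. shift == 32 would rightly fail the check, since bit 32 does not
line up with a DImode subreg boundary.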

OK with that change, thanks, and sorry for the multiple review cycles.

Richard
Segher Boessenkool Aug. 3, 2020, 11:17 p.m. UTC | #2
Hi Xiong Hu,

Very sorry I noticed this only just now...

On Mon, Aug 03, 2020 at 02:58:45PM +0800, luoxhu wrote:
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */

This needs

/* { dg-require-effective-target lp64 } */

as well:

> +/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */

... or those insns will not be generated.
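
I.e. the head of the testcase would then look like:

/* { dg-do compile } */
/* { dg-require-effective-target powerpc_p9vector_ok } */
/* { dg-require-effective-target lp64 } */
/* { dg-options "-O2 -mdejagnu-cpu=power9" } */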


Well done getting this fixed!  Thanks,


Segher
Xionghu Luo Aug. 4, 2020, 3:28 a.m. UTC | #3
On 2020/8/3 22:01, Richard Sandiford wrote:

>>         /* Try a wider mode if truncating the store mode to NEW_MODE
>>   	 requires a real instruction.  */
>>         if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
>> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>>   	  && !targetm.modes_tieable_p (new_mode, store_mode))
>>   	continue;
>>   
>> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
> 
> Like I mentioned before, this should be the other way around:
> 
>      multiple_p (shift, GET_MODE_BITSIZE (new_mode))
> 
> i.e. for the subreg to be valid, the shift amount must be a multiple
> of the outer mode size in bits.
> 
> OK with that change, thanks, and sorry for the multiple review cycles.

Appreciate your time and patience in making it better :).
Updated with that change and the lp64 requirement from Segher, and committed as
r11-2526-g265d817b1eb4644c7a9613ad6920315d98e2e0a4.  Thank you all.

Xionghu

> 
> Richard
>
Xionghu Luo Aug. 5, 2020, 11:30 p.m. UTC | #4
Hi Richard,

On 2020/8/3 22:01, Richard Sandiford wrote:
>>         /* Try a wider mode if truncating the store mode to NEW_MODE
>>   	 requires a real instruction.  */
>>         if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
>> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>>   	  && !targetm.modes_tieable_p (new_mode, store_mode))
>>   	continue;
>>   
>> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
> 
> Like I mentioned before, this should be the other way around:
> 
>      multiple_p (shift, GET_MODE_BITSIZE (new_mode))
> 
> i.e. for the subreg to be valid, the shift amount must be a multiple
> of the outer mode size in bits.
> 
> OK with that change, thanks, and sorry for the multiple review cycles.

Just had another question: do we want to backport this patch to the gcc-10
and gcc-9 branches? :)


Xionghu


> 
> Richard
>
Richard Sandiford Aug. 6, 2020, 5:23 a.m. UTC | #5
luoxhu <luoxhu@linux.ibm.com> writes:
> Hi Richard,
>
> On 2020/8/3 22:01, Richard Sandiford wrote:
>>>         /* Try a wider mode if truncating the store mode to NEW_MODE
>>>   	 requires a real instruction.  */
>>>         if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
>>> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>>>   	  && !targetm.modes_tieable_p (new_mode, store_mode))
>>>   	continue;
>>>   
>>> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
>> 
>> Like I mentioned before, this should be the other way around:
>> 
>>      multiple_p (shift, GET_MODE_BITSIZE (new_mode))
>> 
>> i.e. for the subreg to be valid, the shift amount must be a multiple
>> of the outer mode size in bits.
>> 
>> OK with that change, thanks, and sorry for the multiple review cycles.
>
> Just had another question: do we want to backport this patch to the gcc-10
> and gcc-9 branches? :)

One for the RMs, but it looks like the PR was reported against GCC 7 and
isn't a regression.  Given that it's also “just” a missed optimisation,
I'm guessing it should be trunk only.

Thanks,
Richard
Li, Pan2 via Gcc-patches Aug. 24, 2020, 7:09 p.m. UTC | #6
On Thu, 2020-08-06 at 06:23 +0100, Richard Sandiford wrote:
> luoxhu <luoxhu@linux.ibm.com> writes:
> > Hi Richard,
> > 
> > On 2020/8/3 22:01, Richard Sandiford wrote:
> > > >         /* Try a wider mode if truncating the store mode to NEW_MODE
> > > >   	 requires a real instruction.  */
> > > >         if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
> > > > @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
> > > >   	  && !targetm.modes_tieable_p (new_mode, store_mode))
> > > >   	continue;
> > > >   
> > > > +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
> > > 
> > > Like I mentioned before, this should be the other way around:
> > > 
> > >      multiple_p (shift, GET_MODE_BITSIZE (new_mode))
> > > 
> > > i.e. for the subreg to be valid, the shift amount must be a multiple
> > > of the outer mode size in bits.
> > > 
> > > OK with that change, thanks, and sorry for the multiple review cycles.
> > 
> > Just had another question: do we want to backport this patch to the gcc-10
> > and gcc-9 branches? :)
> 
> One for the RMs, but it looks like the PR was reported against GCC 7 and
> isn't a regression.  Given that it's also “just” a missed optimisation,
> I'm guessing it should be trunk only.
Not a release manager, but I agree, let's keep this on the trunk.

jeff
>

Patch

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..b67b5c2ba35 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1720,6 +1720,35 @@  find_shift_sequence (poly_int64 access_size,
   scalar_int_mode new_mode;
   rtx read_reg = NULL;
 
+  /* If a constant was stored into memory, try to simplify it here,
+     otherwise the cost of the shift might preclude this optimization
+     e.g. at -Os, even when no actual shift will be needed.  */
+  if (store_info->const_rhs)
+    {
+      auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
+      auto byte = subreg_lowpart_offset (new_mode, store_mode);
+      rtx ret
+	= simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
+      if (ret && CONSTANT_P (ret))
+	{
+	  rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
+	  ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
+						 shift_rtx);
+	  if (ret && CONSTANT_P (ret))
+	    {
+	      byte = subreg_lowpart_offset (read_mode, new_mode);
+	      ret = simplify_subreg (read_mode, ret, new_mode, byte);
+	      if (ret && CONSTANT_P (ret)
+		  && (set_src_cost (ret, read_mode, speed)
+		      <= COSTS_N_INSNS (1)))
+		return ret;
+	    }
+	}
+    }
+
+  if (require_cst)
+    return NULL_RTX;
+
   /* Some machines like the x86 have shift insns for each size of
      operand.  Other machines like the ppc or the ia-64 may only have
      shift insns that shift values within 32 or 64 bit registers.
@@ -1729,7 +1758,7 @@  find_shift_sequence (poly_int64 access_size,
 
   opt_scalar_int_mode new_mode_iter;
   FOR_EACH_MODE_FROM (new_mode_iter,
-		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+		      smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
     {
       rtx target, new_reg, new_lhs;
       rtx_insn *shift_seq, *insn;
@@ -1739,34 +1768,6 @@  find_shift_sequence (poly_int64 access_size,
       if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
 	break;
 
-      /* If a constant was stored into memory, try to simplify it here,
-	 otherwise the cost of the shift might preclude this optimization
-	 e.g. at -Os, even when no actual shift will be needed.  */
-      if (store_info->const_rhs)
-	{
-	  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
-	  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
-				     store_mode, byte);
-	  if (ret && CONSTANT_P (ret))
-	    {
-	      rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
-	      ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
-						     ret, shift_rtx);
-	      if (ret && CONSTANT_P (ret))
-		{
-		  byte = subreg_lowpart_offset (read_mode, new_mode);
-		  ret = simplify_subreg (read_mode, ret, new_mode, byte);
-		  if (ret && CONSTANT_P (ret)
-		      && (set_src_cost (ret, read_mode, speed)
-			  <= COSTS_N_INSNS (1)))
-		    return ret;
-		}
-	    }
-	}
-
-      if (require_cst)
-	return NULL_RTX;
-
       /* Try a wider mode if truncating the store mode to NEW_MODE
 	 requires a real instruction.  */
       if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@  find_shift_sequence (poly_int64 access_size,
 	  && !targetm.modes_tieable_p (new_mode, store_mode))
 	continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+	  && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+	{
+	  /* Try to implement the shift using a subreg.  */
+	  poly_int64 offset = subreg_offset_from_lsb (new_mode,
+						      store_mode, shift);
+	  rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+						store_mode, offset);
+	  if (rhs_subreg)
+	    {
+	      read_reg
+		= extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+	      break;
+	    }
+	}
+
+      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+	continue;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */