
[v4] dse: Remove partial load after full store for high part access [PR71309]

Message ID a086e3c8-870f-daf8-85ee-b63a9f3d32c0@linux.ibm.com
State New
Series [v4] dse: Remove partial load after full store for high part access [PR71309]

Commit Message

Xionghu Luo July 24, 2020, 10:47 a.m. UTC
Hi Richard,

This is the updated version, which passes all regression tests on
Power9-LE.
It just needs another "maybe_lt (GET_MODE_SIZE (new_mode), access_size)"
check before generating the shift for store_info->const_rhs, to ensure
that the correct constant is generated.  Take testsuite/gfortran1/equiv_2.x
as an example:

equiv_2.x-equiv_2.f90.264r.cse2:
5: r119:DI=0x6867666564636261
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=[sfp:DI+0x21]
20: r125:CC=cmp(r124:SI,r123:SI)

Bad code: taking the SImode subreg at offset 0 and shifting it right by
8 bits yields 0x646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

Good code: insn #32 gets 0x65646362, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x65646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

So the new_mode size should also be greater than or equal to access_size
for the const_rhs shift.  It seems a bit more complicated now...
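
To make the arithmetic concrete, here is a small standalone C sketch
(mine, not part of the patch; it assumes little-endian byte order as on
Power9-LE) that recomputes both constants from the stored DImode value:

  #include <stdio.h>
  #include <stdint.h>

  int
  main (void)
  {
    /* DImode constant stored at sfp+0x20 by insn #6.  */
    uint64_t stored = 0x6867666564636261ULL;

    /* Correct: the SImode read at sfp+0x21 starts one byte into the
       store, i.e. the full DImode value shifted right by 8 bits and
       then truncated to 32 bits.  */
    uint32_t good = (uint32_t) (stored >> 8);	/* 0x65646362 */

    /* Wrong: taking the SImode lowpart first and then shifting drops
       the byte that should shift in from bits 32..39.  */
    uint32_t bad = (uint32_t) stored >> 8;	/* 0x00646362 */

    printf ("good = 0x%08x, bad = 0x%08x\n", good, bad);
    return 0;
  }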



[PATCH v4] dse: Remove partial load after full store for high part access [PR71309]


This patch optimizes the case below (it works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8
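
In C terms, the effect is roughly as in the sketch below (a hand-written
analogue for illustration, not code from the patch; on little-endian the
r119:TI#8 subreg is the high 8 bytes of the 16-byte register):

  #include <stdint.h>
  #include <string.h>

  typedef struct { uint64_t lo, hi; } u128;

  /* Before: 16-byte load and store, then a partial re-load (insn #8)
     of bytes the store just wrote.  */
  uint64_t
  before (uint64_t *p)
  {
    u128 v;
    memcpy (&v, p + 2, 16);	/* 6: r119:TI=[r118:DI+0x10] */
    memcpy (p, &v, 16);		/* 7: [r118:DI]=r119:TI */
    return p[1];		/* 8: r121:DI=[r118:DI+0x8] */
  }

  /* After: the re-load is replaced by the high half of the value that
     is still live in a register (insn #16).  */
  uint64_t
  after (uint64_t *p)
  {
    u128 v;
    memcpy (&v, p + 2, 16);	/* 6: r119:TI=[r118:DI+0x10] */
    memcpy (p, &v, 16);		/* 7: [r118:DI]=r119:TI */
    return v.hi;		/* 16: r122:DI=r119:TI#8 */
  }

Avoiding a read of just-stored bytes sidesteps the penalty for a load
that partially overlaps a pending store, which is presumably where the
Power9 improvement quoted below comes from.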

The final asm is then as below, with no partial load after the full
store (stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

This achieves a ~25% performance improvement for typical cases on
Power9.  Bootstrapped and regression tested on Power9-LE.

For AArch64, one ldr is replaced by a mov with this patch:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-24  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-24  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 25 ++++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

Comments

Xionghu Luo July 28, 2020, 11:06 p.m. UTC | #1
Gentle ping in case this mail was missed.  Thanks :)

https://gcc.gnu.org/pipermail/gcc-patches/2020-July/550602.html

Xionghu


Richard Sandiford July 31, 2020, 4:29 p.m. UTC | #2
Sorry for the slow reply.

luoxhu <luoxhu@linux.ibm.com> writes:
> Hi Richard,
>
> This is the updated version, which passes all regression tests on
> Power9-LE.
> It just needs another "maybe_lt (GET_MODE_SIZE (new_mode), access_size)"
> check before generating the shift for store_info->const_rhs, to ensure
> that the correct constant is generated.  Take testsuite/gfortran1/equiv_2.x
> as an example:
>
> equiv_2.x-equiv_2.f90.264r.cse2:
> 5: r119:DI=0x6867666564636261
> 6: [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=[sfp:DI+0x21]
> 20: r125:CC=cmp(r124:SI,r123:SI)
>
> Bad code: taking the SImode subreg at offset 0 and shifting it right by
> 8 bits yields 0x646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
> 5:  r119:DI=0x6867666564636261
> 32: r126:SI=0x646362
> 6:  [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=r126:SI
> 20: r125:CC=cmp(r124:SI,r123:SI)
>
> Good code: insn #32 gets 0x65646362, equiv_2.x-equiv_2.f90.265r.dse1:
> 5:  r119:DI=0x6867666564636261
> 32: r126:SI=0x65646362
> 6:  [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=r126:SI
> 20: r125:CC=cmp(r124:SI,r123:SI)
>
> So the new_mode size should also be greater than or equal to access_size
> for the const_rhs shift.  It seems a bit more complicated now...

Yeah.  The structure of this code makes it hard to make changes to it. :-/

Since the const_rhs code is only trying to fold the operation at compile
time, without regard for what the target supports at runtime, I think it
*does* make sense to use smallest_int_mode_for_size there.  I.e. we should
be able to hoist the code out of the loop and do:

  if (store_info->const_rhs)
    {
      auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
      auto byte = subreg_lowpart_offset (new_mode, store_mode);
      rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
                                 store_mode, byte);
      if (ret && CONSTANT_P (ret))
        …
    }

  if (require_cst)
    return NULL_RTX;

  …the current loop…

I don't see any reason to restrict the const_rhs case to BITS_PER_WORD.
And if the simplify_subreg above fails for new_mode, I can't see it
succeeding for a wider mode (which would have to provide *more*
constant data than the failed subreg).

> @@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
>       the machine.  */
>  
>    opt_scalar_int_mode new_mode_iter;
> -  FOR_EACH_MODE_FROM (new_mode_iter,
> -		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
> +  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)

Sorry, I now realise it would be better to start at:

smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode))

Thanks,
Richard

Patch

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..a7cdc43a182 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@  find_shift_sequence (poly_int64 access_size,
      the machine.  */
 
   opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
-		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
     {
       rtx target, new_reg, new_lhs;
       rtx_insn *shift_seq, *insn;
@@ -1744,6 +1743,9 @@  find_shift_sequence (poly_int64 access_size,
 	 e.g. at -Os, even when no actual shift will be needed.  */
       if (store_info->const_rhs)
 	{
+	  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+	    continue;
+
 	  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
 	  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
 				     store_mode, byte);
@@ -1779,6 +1781,25 @@  find_shift_sequence (poly_int64 access_size,
 	  && !targetm.modes_tieable_p (new_mode, store_mode))
 	continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+	  && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+	{
+	  /* Try to implement the shift using a subreg.  */
+	  poly_int64 offset = subreg_offset_from_lsb (new_mode,
+						      store_mode, shift);
+	  rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+						store_mode, offset);
+	  if (rhs_subreg)
+	    {
+	      read_reg
+		= extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+	      break;
+	    }
+	}
+
+      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+	continue;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */