combine: Narrow comparison of memory and constant

Message ID 20230612075737.1801-1-stefansf@linux.ibm.com

Commit Message

Stefan Schulze Frielinghaus June 12, 2023, 7:57 a.m. UTC
Comparisons between memory and constants can sometimes be done in a
smaller mode, resulting in smaller constants which may end up as
immediates instead of in the literal pool.

For example, on s390x a non-symmetric comparison like
  x <= 0x3fffffffffffffff
results in the constant being spilled to the literal pool and an 8-byte
memory comparison being emitted.  Ideally, an equivalent comparison
  x0 <= 0x3f
is emitted instead, where x0 is the most significant byte of x; the
constant is smaller and more likely to materialize as an immediate.

Similarly, comparisons of the form
  x >= 0x4000000000000000
can be shortened into x0 >= 0x40.
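
As a quick sanity check of the equivalence (illustrative only, not part
of the patch; it assumes a 64-bit unsigned long, as in the lp64 tests
below), shifting out everything but the most significant byte must not
change the outcome of either comparison:

  #include <assert.h>

  static void
  check (unsigned long x)
  {
    unsigned char x0 = x >> 56;  /* most significant byte of x */
    assert ((x <= 0x3fffffffffffffffUL) == (x0 <= 0x3f));
    assert ((x >= 0x4000000000000000UL) == (x0 >= 0x40));
  }

  int
  main (void)
  {
    check (0UL);
    check (0x3fffffffffffffffUL);
    check (0x4000000000000000UL);
    check (-1UL);
    return 0;
  }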

I'm not entirely sure whether combine is the right place to implement
something like this.  In my first try I implemented it in
TARGET_CANONICALIZE_COMPARISON but then thought other targets might
benefit from it, too.  simplify_context::simplify_relational_operation_1
seems to be the wrong place since code/mode may change.  Any opinions?

gcc/ChangeLog:

	* combine.cc (simplify_compare_const): Narrow comparison of
	memory and constant.
	(try_combine): Adapt to new function signature.
	(simplify_comparison): Adapt to new function signature.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/cmp-mem-const-1.c: New test.
	* gcc.target/s390/cmp-mem-const-2.c: New test.
---
 gcc/combine.cc                                | 82 ++++++++++++++-
 .../gcc.target/s390/cmp-mem-const-1.c         | 99 +++++++++++++++++++
 .../gcc.target/s390/cmp-mem-const-2.c         | 23 +++++
 3 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-2.c

Comments

Jeff Law June 12, 2023, 9:29 p.m. UTC | #1
On 6/12/23 01:57, Stefan Schulze Frielinghaus via Gcc-patches wrote:
> Comparisons between memory and constants can sometimes be done in a
> smaller mode, resulting in smaller constants which may end up as
> immediates instead of in the literal pool.
> 
> For example, on s390x a non-symmetric comparison like
>    x <= 0x3fffffffffffffff
> results in the constant being spilled to the literal pool and an 8-byte
> memory comparison being emitted.  Ideally, an equivalent comparison
>    x0 <= 0x3f
> is emitted instead, where x0 is the most significant byte of x; the
> constant is smaller and more likely to materialize as an immediate.
> 
> Similarly, comparisons of the form
>    x >= 0x4000000000000000
> can be shortened into x0 >= 0x40.
> 
> I'm not entirely sure whether combine is the right place to implement
> something like this.  In my first try I implemented it in
> TARGET_CANONICALIZE_COMPARISON but then thought other targets might
> benefit from it, too.  simplify_context::simplify_relational_operation_1
> seems to be the wrong place since code/mode may change.  Any opinions?
> 
> gcc/ChangeLog:
> 
> 	* combine.cc (simplify_compare_const): Narrow comparison of
> 	memory and constant.
> 	(try_combine): Adapt to new function signature.
> 	(simplify_comparison): Adapt to new function signature.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/s390/cmp-mem-const-1.c: New test.
> 	* gcc.target/s390/cmp-mem-const-2.c: New test.
This does seem more general than we'd want to do in the canonicalization 
hook.  So thanks for going the extra mile and doing a generic 
implementation.




> @@ -11987,6 +11988,79 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
>         break;
>       }
>   
> +  /* Narrow non-symmetric comparison of memory and constant as e.g.
> +     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
> +     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
> +     x0 >= 0x40.  */
> +  if ((code == LEU || code == LTU || code == GEU || code == GTU)
> +      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
> +      && MEM_P (op0)
> +      && !MEM_VOLATILE_P (op0)
> +      && (unsigned HOST_WIDE_INT)const_op > 0xff)
> +    {
> +      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT)const_op;
> +      enum rtx_code adjusted_code = code;
> +
> +      /* If the least significant bit is already zero, then adjust the
> +	 comparison in the hope that we hit cases like
> +	   op0  <= 0x3ffffdfffffffffe
> +	 where the adjusted comparison
> +	   op0  <  0x3ffffdffffffffff
> +	 can be shortened into
> +	   op0' <  0x3ffffd.  */
> +      if (code == LEU && (n & 1) == 0)
> +	{
> +	  ++n;
> +	  adjusted_code = LTU;
> +	}
> +      /* or e.g. op0 < 0x4020000000000000  */
> +      else if (code == LTU && (n & 1) == 0)
> +	{
> +	  --n;
> +	  adjusted_code = LEU;
> +	}
> +      /* or op0 >= 0x4000000000000001  */
> +      else if (code == GEU && (n & 1) == 1)
> +	{
> +	  --n;
> +	  adjusted_code = GTU;
> +	}
> +      /* or op0 > 0x3fffffffffffffff.  */
> +      else if (code == GTU && (n & 1) == 1)
> +	{
> +	  ++n;
> +	  adjusted_code = GEU;
> +	}
> +
> +      scalar_int_mode narrow_mode_iter;
> +      bool lower_p = code == LEU || code == LTU;
> +      bool greater_p = !lower_p;
> +      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
> +	{
> +	  unsigned nbits = GET_MODE_PRECISION (int_mode)
> +			  - GET_MODE_PRECISION (narrow_mode_iter);
> +	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
> +	  unsigned HOST_WIDE_INT lower_bits = n & mask;
> +	  if ((lower_p && lower_bits == mask)
> +	      || (greater_p && lower_bits == 0))
> +	    {
> +	      n >>= nbits;
> +	      break;
> +	    }
> +	}
> +
> +      if (narrow_mode_iter < int_mode)
> +	{
> +	  poly_int64 offset = BYTES_BIG_ENDIAN
> +				? 0
> +				: GET_MODE_SIZE (int_mode)
> +				  - GET_MODE_SIZE (narrow_mode_iter);
Go ahead and add some parentheses here.  I'd add one pair around the 
whole RHS of that assignment.  The '?' and ':' would line up under the 
'B' in that case.  Similarly add them around the false arm of the 
ternary.  The '-' will line up under the 'G'.

Going to trust you got the little endian adjustment correct here ;-)
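
For reference, on a little-endian target the most significant byte sits
at the highest address, so the narrow access needs the GET_MODE_SIZE
difference as its offset.  A stand-alone check (illustrative only; again
assumes an 8-byte unsigned long):

  #include <stdio.h>
  #include <string.h>

  int
  main (void)
  {
    unsigned long x = 0x4000000000000000UL;
    unsigned char b[sizeof (x)];
    memcpy (b, &x, sizeof (x));
    /* Big-endian: 0x40 ends up in b[0].  Little-endian: 0x40 ends up in
       b[sizeof (x) - 1], which is why the patch uses the difference of
       the two mode sizes as offset.  */
    printf ("b[0] = 0x%02x, b[%zu] = 0x%02x\n",
            b[0], sizeof (x) - 1, b[sizeof (x) - 1]);
    return 0;
  }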


>   
>         /* Compute some predicates to simplify code below.  */
> diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> new file mode 100644
> index 00000000000..b90c2a8c224
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> @@ -0,0 +1,99 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -march=z13 -mzarch" } */
> +/* { dg-final { scan-assembler-not {\tclc\t} } } */
> +
> +int
> +ge_8b (unsigned long *x)
> +{
> +  return *x >= 0x4000000000000000;
> +}
Would it be possible to add some debugging output in 
simplify_compare_const so that you could search for that debugging 
output and make these tests generic?

Jeff
Stefan Schulze Frielinghaus June 19, 2023, 2:19 p.m. UTC | #2
On Mon, Jun 12, 2023 at 03:29:00PM -0600, Jeff Law wrote:
> 
> 
> On 6/12/23 01:57, Stefan Schulze Frielinghaus via Gcc-patches wrote:
> > Comparisons between memory and constants can sometimes be done in a
> > smaller mode, resulting in smaller constants which may end up as
> > immediates instead of in the literal pool.
> > 
> > For example, on s390x a non-symmetric comparison like
> >    x <= 0x3fffffffffffffff
> > results in the constant being spilled to the literal pool and an 8-byte
> > memory comparison being emitted.  Ideally, an equivalent comparison
> >    x0 <= 0x3f
> > is emitted instead, where x0 is the most significant byte of x; the
> > constant is smaller and more likely to materialize as an immediate.
> > 
> > Similarly, comparisons of the form
> >    x >= 0x4000000000000000
> > can be shortened into x0 >= 0x40.
> > 
> > I'm not entirely sure whether combine is the right place to implement
> > something like this.  In my first try I implemented it in
> > TARGET_CANONICALIZE_COMPARISON but then thought other targets might
> > benefit from it, too.  simplify_context::simplify_relational_operation_1
> > seems to be the wrong place since code/mode may change.  Any opinions?
> > 
> > gcc/ChangeLog:
> > 
> > 	* combine.cc (simplify_compare_const): Narrow comparison of
> > 	memory and constant.
> > 	(try_combine): Adapt to new function signature.
> > 	(simplify_comparison): Adapt to new function signature.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > 	* gcc.target/s390/cmp-mem-const-1.c: New test.
> > 	* gcc.target/s390/cmp-mem-const-2.c: New test.
> This does seem more general than we'd want to do in the canonicalization
> hook.  So thanks for going the extra mile and doing a generic
> implementation.
> 
> 
> 
> 
> > @@ -11987,6 +11988,79 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
> >         break;
> >       }
> > +  /* Narrow non-symmetric comparison of memory and constant as e.g.
> > +     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
> > +     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
> > +     x0 >= 0x40.  */
> > +  if ((code == LEU || code == LTU || code == GEU || code == GTU)
> > +      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
> > +      && MEM_P (op0)
> > +      && !MEM_VOLATILE_P (op0)
> > +      && (unsigned HOST_WIDE_INT)const_op > 0xff)
> > +    {
> > +      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT)const_op;
> > +      enum rtx_code adjusted_code = code;
> > +
> > +      /* If the least significant bit is already zero, then adjust the
> > +	 comparison in the hope that we hit cases like
> > +	   op0  <= 0x3ffffdfffffffffe
> > +	 where the adjusted comparison
> > +	   op0  <  0x3ffffdffffffffff
> > +	 can be shortened into
> > +	   op0' <  0x3ffffd.  */
> > +      if (code == LEU && (n & 1) == 0)
> > +	{
> > +	  ++n;
> > +	  adjusted_code = LTU;
> > +	}
> > +      /* or e.g. op0 < 0x4020000000000000  */
> > +      else if (code == LTU && (n & 1) == 0)
> > +	{
> > +	  --n;
> > +	  adjusted_code = LEU;
> > +	}
> > +      /* or op0 >= 0x4000000000000001  */
> > +      else if (code == GEU && (n & 1) == 1)
> > +	{
> > +	  --n;
> > +	  adjusted_code = GTU;
> > +	}
> > +      /* or op0 > 0x3fffffffffffffff.  */
> > +      else if (code == GTU && (n & 1) == 1)
> > +	{
> > +	  ++n;
> > +	  adjusted_code = GEU;
> > +	}
> > +
> > +      scalar_int_mode narrow_mode_iter;
> > +      bool lower_p = code == LEU || code == LTU;
> > +      bool greater_p = !lower_p;
> > +      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
> > +	{
> > +	  unsigned nbits = GET_MODE_PRECISION (int_mode)
> > +			  - GET_MODE_PRECISION (narrow_mode_iter);
> > +	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
> > +	  unsigned HOST_WIDE_INT lower_bits = n & mask;
> > +	  if ((lower_p && lower_bits == mask)
> > +	      || (greater_p && lower_bits == 0))
> > +	    {
> > +	      n >>= nbits;
> > +	      break;
> > +	    }
> > +	}
> > +
> > +      if (narrow_mode_iter < int_mode)
> > +	{
> > +	  poly_int64 offset = BYTES_BIG_ENDIAN
> > +				? 0
> > +				: GET_MODE_SIZE (int_mode)
> > +				  - GET_MODE_SIZE (narrow_mode_iter);
> Go ahead and add some parentheses here.  I'd add one pair around the whole
> RHS of that assignment.  The '?' and ':' would line up under the 'B' in that
> case.  Similarly add them around the false arm of the ternary.  The '-' will
> line up under the 'G'.

Done.

> 
> Going to trust you got the little endian adjustment correct here ;-)

Sadly I gave it a try on x64, aarch64, and powerpc64le, and in all cases
the resulting instructions were rejected, either because the costs were
higher or because the new instructions failed to match.  Thus, currently
I have only tested this thoroughly on s390x.

> 
> 
> >         /* Compute some predicates to simplify code below.  */
> > diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> > new file mode 100644
> > index 00000000000..b90c2a8c224
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> > @@ -0,0 +1,99 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -march=z13 -mzarch" } */
> > +/* { dg-final { scan-assembler-not {\tclc\t} } } */
> > +
> > +int
> > +ge_8b (unsigned long *x)
> > +{
> > +  return *x >= 0x4000000000000000;
> > +}
> Would it be possible to add some debugging output in simplify_compare_const
> so that you could search for that debugging output and make these tests
> generic?

I very much like this idea, i.e., I made those tests generic by adding
some debug output.
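
For instance, something along these lines (a sketch only; the exact
message in V2 may differ):

  if (dump_file)
    fprintf (dump_file, "narrow comparison from mode %s to %s\n",
             GET_MODE_NAME (int_mode), GET_MODE_NAME (narrow_mode_iter));

which a generic test can then match with a scan-rtl-dump pattern on the
combine dump.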

Beside that I cleaned up the code a bit and will send a V2 shortly.

Cheers,
Stefan
Patch

diff --git a/gcc/combine.cc b/gcc/combine.cc
index 5aa0ec5c45a..6ad1600dc1b 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -460,7 +460,7 @@  static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx,
 static int recog_for_combine (rtx *, rtx_insn *, rtx *);
 static rtx gen_lowpart_for_combine (machine_mode, rtx);
 static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode,
-					     rtx, rtx *);
+					     rtx *, rtx *);
 static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *);
 static void update_table_tick (rtx);
 static void record_value_for_reg (rtx, rtx_insn *, rtx);
@@ -3185,7 +3185,7 @@  try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
 	  compare_code = orig_compare_code = GET_CODE (*cc_use_loc);
 	  if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode))
 	    compare_code = simplify_compare_const (compare_code, mode,
-						   op0, &op1);
+						   &op0, &op1);
 	  target_canonicalize_comparison (&compare_code, &op0, &op1, 1);
 	}
 
@@ -11800,9 +11800,10 @@  gen_lowpart_for_combine (machine_mode omode, rtx x)
 
 static enum rtx_code
 simplify_compare_const (enum rtx_code code, machine_mode mode,
-			rtx op0, rtx *pop1)
+			rtx *pop0, rtx *pop1)
 {
   scalar_int_mode int_mode;
+  rtx op0 = *pop0;
   HOST_WIDE_INT const_op = INTVAL (*pop1);
 
   /* Get the constant we are comparing against and turn off all bits
@@ -11987,6 +11988,79 @@  simplify_compare_const (enum rtx_code code, machine_mode mode,
       break;
     }
 
+  /* Narrow non-symmetric comparison of memory and constant as e.g.
+     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
+     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
+     x0 >= 0x40.  */
+  if ((code == LEU || code == LTU || code == GEU || code == GTU)
+      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
+      && MEM_P (op0)
+      && !MEM_VOLATILE_P (op0)
+      && (unsigned HOST_WIDE_INT)const_op > 0xff)
+    {
+      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT)const_op;
+      enum rtx_code adjusted_code = code;
+
+      /* If the least significant bit is already zero, then adjust the
+	 comparison in the hope that we hit cases like
+	   op0  <= 0x3ffffdfffffffffe
+	 where the adjusted comparison
+	   op0  <  0x3ffffdffffffffff
+	 can be shortened into
+	   op0' <  0x3ffffd.  */
+      if (code == LEU && (n & 1) == 0)
+	{
+	  ++n;
+	  adjusted_code = LTU;
+	}
+      /* or e.g. op0 < 0x4020000000000000  */
+      else if (code == LTU && (n & 1) == 0)
+	{
+	  --n;
+	  adjusted_code = LEU;
+	}
+      /* or op0 >= 0x4000000000000001  */
+      else if (code == GEU && (n & 1) == 1)
+	{
+	  --n;
+	  adjusted_code = GTU;
+	}
+      /* or op0 > 0x3fffffffffffffff.  */
+      else if (code == GTU && (n & 1) == 1)
+	{
+	  ++n;
+	  adjusted_code = GEU;
+	}
+
+      scalar_int_mode narrow_mode_iter;
+      bool lower_p = code == LEU || code == LTU;
+      bool greater_p = !lower_p;
+      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
+	{
+	  unsigned nbits = GET_MODE_PRECISION (int_mode)
+			  - GET_MODE_PRECISION (narrow_mode_iter);
+	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
+	  unsigned HOST_WIDE_INT lower_bits = n & mask;
+	  if ((lower_p && lower_bits == mask)
+	      || (greater_p && lower_bits == 0))
+	    {
+	      n >>= nbits;
+	      break;
+	    }
+	}
+
+      if (narrow_mode_iter < int_mode)
+	{
+	  poly_int64 offset = BYTES_BIG_ENDIAN
+				? 0
+				: GET_MODE_SIZE (int_mode)
+				  - GET_MODE_SIZE (narrow_mode_iter);
+	  *pop0 = adjust_address (op0, narrow_mode_iter, offset);
+	  *pop1 = GEN_INT (n);
+	  return adjusted_code;
+	}
+    }
+
   *pop1 = GEN_INT (const_op);
   return code;
 }
@@ -12179,7 +12253,7 @@  simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1)
 
       /* Try to simplify the compare to constant, possibly changing the
 	 comparison op, and/or changing op1 to zero.  */
-      code = simplify_compare_const (code, raw_mode, op0, &op1);
+      code = simplify_compare_const (code, raw_mode, &op0, &op1);
       const_op = INTVAL (op1);
 
       /* Compute some predicates to simplify code below.  */
diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
new file mode 100644
index 00000000000..b90c2a8c224
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
@@ -0,0 +1,99 @@ 
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -march=z13 -mzarch" } */
+/* { dg-final { scan-assembler-not {\tclc\t} } } */
+
+int
+ge_8b (unsigned long *x)
+{
+  return *x >= 0x4000000000000000;
+}
+
+int
+ge_8b_adjust (unsigned long *x)
+{
+  return *x >= 0x4000000000000001;
+}
+
+int
+ge_6b (unsigned long *x)
+{
+  return *x >= 0x400000000000;
+}
+
+int
+ge_6b_adjust (unsigned long *x)
+{
+  return *x >= 0x400000000001;
+}
+
+int
+gt_8b (unsigned long *x)
+{
+  return *x > 0x4000000000000000;
+}
+
+int
+gt_8b_adjust (unsigned long *x)
+{
+  return *x > 0x3fffffffffffffff;
+}
+
+int
+gt_6b (unsigned long *x)
+{
+  return *x > 0x400000000000;
+}
+
+int
+gt_6b_adjust (unsigned long *x)
+{
+  return *x > 0x3fffffffffff;
+}
+
+int
+le_8b (unsigned long *x)
+{
+  return *x <= 0x3ff7ffffffffffff;
+}
+
+int
+le_6b (unsigned long *x)
+{
+  return *x <= 0x3f7fffffffff;
+}
+
+int
+le_8b_adjust (unsigned long *x)
+{
+  return *x <= 0x3ffffdfffffffffe;
+}
+
+int
+le_6b_adjust (unsigned long *x)
+{
+  return *x <= 0x3ffffffffffe;
+}
+
+int
+lt_8b (unsigned long *x)
+{
+  return *x < 0x3fffffffffffffff;
+}
+
+int
+lt_6b (unsigned long *x)
+{
+  return *x < 0x3fffffffffff;
+}
+
+int
+lt_8b_adjust (unsigned long *x)
+{
+  return *x < 0x4020000000000000;
+}
+
+int
+lt_6b_adjust (unsigned long *x)
+{
+  return *x < 0x400000000000;
+}
diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-2.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-2.c
new file mode 100644
index 00000000000..e3951870d94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-2.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -march=z13 -mzarch" } */
+/* { dg-final { scan-assembler-not {\tclc\t} } } */
+
+struct s
+{
+  long a;
+  unsigned b : 1;
+  unsigned c : 1;
+};
+
+int foo (struct s *x)
+{
+  /* Expression
+       x->b || x->c
+     is transformed into
+       _1 = BIT_FIELD_REF <*x_4(D), 64, 64>;
+       _2 = _1 > 0x3FFFFFFFFFFFFFFF;
+     where the constant may materialize in the literal pool and an 8 byte CLC
+     may be emitted.  Ensure this is not the case.
+  */
+  return x->b || x->c;
+}