Message ID | 20230619142356.345159-1-stefansf@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [v2] combine: Narrow comparison of memory and constant | expand |
ping On Mon, Jun 19, 2023 at 04:23:57PM +0200, Stefan Schulze Frielinghaus wrote: > Comparisons between memory and constants might be done in a smaller mode > resulting in smaller constants which might finally end up as immediates > instead of in the literal pool. > > For example, on s390x a non-symmetric comparison like > x <= 0x3fffffffffffffff > results in the constant being spilled to the literal pool and an 8 byte > memory comparison is emitted. Ideally, an equivalent comparison > x0 <= 0x3f > where x0 is the most significant byte of x, is emitted where the > constant is smaller and more likely to materialize as an immediate. > > Similarly, comparisons of the form > x >= 0x4000000000000000 > can be shortened into x0 >= 0x40. > > Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le. > Note, the new tests show that for the mentioned little-endian targets > the optimization does not materialize since either the costs of the new > instructions are higher or they do not match. Still ok for mainline? > > gcc/ChangeLog: > > * combine.cc (simplify_compare_const): Narrow comparison of > memory and constant. > (try_combine): Adapt new function signature. > (simplify_comparison): Adapt new function signature. > > gcc/testsuite/ChangeLog: > > * gcc.dg/cmp-mem-const-1.c: New test. > * gcc.dg/cmp-mem-const-2.c: New test. > * gcc.dg/cmp-mem-const-3.c: New test. > * gcc.dg/cmp-mem-const-4.c: New test. > * gcc.dg/cmp-mem-const-5.c: New test. > * gcc.dg/cmp-mem-const-6.c: New test. > * gcc.target/s390/cmp-mem-const-1.c: New test. 
> --- > gcc/combine.cc | 79 +++++++++++++++++-- > gcc/testsuite/gcc.dg/cmp-mem-const-1.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-2.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-3.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-4.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-5.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-6.c | 17 ++++ > .../gcc.target/s390/cmp-mem-const-1.c | 24 ++++++ > 8 files changed, 200 insertions(+), 5 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-1.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-2.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-3.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-4.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-5.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-6.c > create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > > diff --git a/gcc/combine.cc b/gcc/combine.cc > index 5aa0ec5c45a..56e15a93409 100644 > --- a/gcc/combine.cc > +++ b/gcc/combine.cc > @@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx, > static int recog_for_combine (rtx *, rtx_insn *, rtx *); > static rtx gen_lowpart_for_combine (machine_mode, rtx); > static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode, > - rtx, rtx *); > + rtx *, rtx *); > static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *); > static void update_table_tick (rtx); > static void record_value_for_reg (rtx, rtx_insn *, rtx); > @@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, > compare_code = orig_compare_code = GET_CODE (*cc_use_loc); > if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode)) > compare_code = simplify_compare_const (compare_code, mode, > - op0, &op1); > + &op0, &op1); > target_canonicalize_comparison (&compare_code, &op0, &op1, 1); > } > > @@ -11796,13 +11796,14 @@ gen_lowpart_for_combine (machine_mode omode, rtx x) > (CODE OP0 const0_rtx) 
form. > > The result is a possibly different comparison code to use. > - *POP1 may be updated. */ > + *POP0 and *POP1 may be updated. */ > > static enum rtx_code > simplify_compare_const (enum rtx_code code, machine_mode mode, > - rtx op0, rtx *pop1) > + rtx *pop0, rtx *pop1) > { > scalar_int_mode int_mode; > + rtx op0 = *pop0; > HOST_WIDE_INT const_op = INTVAL (*pop1); > > /* Get the constant we are comparing against and turn off all bits > @@ -11987,6 +11988,74 @@ simplify_compare_const (enum rtx_code code, machine_mode mode, > break; > } > > + /* Narrow non-symmetric comparison of memory and constant as e.g. > + x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most > + significant byte. Likewise, transform x0...x7 >= 0x4000000000000000 into > + x0 >= 0x40. */ > + if ((code == LEU || code == LTU || code == GEU || code == GTU) > + && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode) > + && MEM_P (op0) > + && !MEM_VOLATILE_P (op0) > + /* The optimization makes only sense for constants which are big enough > + so that we have a chance to chop off something at all. */ > + && (unsigned HOST_WIDE_INT) const_op > 0xff > + /* Ensure that we do not overflow during normalization. */ > + && (code != GTU || (unsigned HOST_WIDE_INT) const_op < HOST_WIDE_INT_M1U)) > + { > + unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT) const_op; > + enum rtx_code adjusted_code; > + > + /* Normalize code to either LEU or GEU. 
*/ > + if (code == LTU) > + { > + --n; > + adjusted_code = LEU; > + } > + else if (code == GTU) > + { > + ++n; > + adjusted_code = GEU; > + } > + else > + adjusted_code = code; > + > + scalar_int_mode narrow_mode_iter; > + FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode) > + { > + unsigned nbits = GET_MODE_PRECISION (int_mode) > + - GET_MODE_PRECISION (narrow_mode_iter); > + unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1; > + unsigned HOST_WIDE_INT lower_bits = n & mask; > + if ((adjusted_code == LEU && lower_bits == mask) > + || (adjusted_code == GEU && lower_bits == 0)) > + { > + n >>= nbits; > + break; > + } > + } > + > + if (narrow_mode_iter < int_mode) > + { > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf ( > + dump_file, "narrow comparison from mode %s to %s: (MEM %s " > + HOST_WIDE_INT_PRINT_HEX ") to (MEM %s " > + HOST_WIDE_INT_PRINT_HEX ").\n", GET_MODE_NAME (int_mode), > + GET_MODE_NAME (narrow_mode_iter), GET_RTX_NAME (code), > + (unsigned HOST_WIDE_INT)const_op, GET_RTX_NAME (adjusted_code), > + n); > + } > + poly_int64 offset = (BYTES_BIG_ENDIAN > + ? 0 > + : (GET_MODE_SIZE (int_mode) > + - GET_MODE_SIZE (narrow_mode_iter))); > + *pop0 = adjust_address_nv (op0, narrow_mode_iter, offset); > + *pop1 = GEN_INT (n); > + return adjusted_code; > + } > + } > + > *pop1 = GEN_INT (const_op); > return code; > } > @@ -12179,7 +12248,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1) > > /* Try to simplify the compare to constant, possibly changing the > comparison op, and/or changing op1 to zero. */ > - code = simplify_compare_const (code, raw_mode, op0, &op1); > + code = simplify_compare_const (code, raw_mode, &op0, &op1); > const_op = INTVAL (op1); > > /* Compute some predicates to simplify code below. 
*/ > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-1.c b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c > new file mode 100644 > index 00000000000..263ad98af79 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +le_1byte_a (uint64_t *x) > +{ > + return *x <= 0x3fffffffffffffff; > +} > + > +int > +le_1byte_b (uint64_t *x) > +{ > + return *x < 0x4000000000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-2.c b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c > new file mode 100644 > index 00000000000..a7cc5348295 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +ge_1byte_a (uint64_t *x) > +{ > + return *x > 0x3fffffffffffffff; > +} > + > +int > +ge_1byte_b (uint64_t *x) > +{ > + return *x >= 0x4000000000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-3.c b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c > new file mode 100644 > index 00000000000..06f80bf72d8 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +le_2bytes_a (uint64_t *x) > +{ > + return *x <= 0x3ffdffffffffffff; > +} > + > +int > +le_2bytes_b (uint64_t *x) > +{ > + return *x < 0x3ffe000000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-4.c 
b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c > new file mode 100644 > index 00000000000..407999abf7e > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +ge_2bytes_a (uint64_t *x) > +{ > + return *x > 0x400cffffffffffff; > +} > + > +int > +ge_2bytes_b (uint64_t *x) > +{ > + return *x >= 0x400d000000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-5.c b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c > new file mode 100644 > index 00000000000..e16773f5bcf > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +le_4bytes_a (uint64_t *x) > +{ > + return *x <= 0x3ffffdffffffffff; > +} > + > +int > +le_4bytes_b (uint64_t *x) > +{ > + return *x < 0x3ffffe0000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-6.c b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c > new file mode 100644 > index 00000000000..8f53b5678bd > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +ge_4bytes_a (uint64_t *x) > +{ > + return *x > 0x4000cfffffffffff; > +} > + > +int > +ge_4bytes_b (uint64_t *x) > +{ > + return *x >= 0x4000d00000000000; > +} > diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > new file mode 
100644 > index 00000000000..309aafbec01 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > @@ -0,0 +1,24 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -march=z13 -mzarch -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-assembler-not {\tclc\t} } } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > + > +struct s > +{ > + long a; > + unsigned b : 1; > + unsigned c : 1; > +}; > + > +int foo (struct s *x) > +{ > + /* Expression > + x->b || x->c > + is transformed into > + _1 = BIT_FIELD_REF <*x_4(D), 64, 64>; > + _2 = _1 > 0x3FFFFFFFFFFFFFFF; > + where the constant may materialize in the literal pool and an 8 byte CLC > + may be emitted. Ensure this is not the case. > + */ > + return x->b || x->c; > +} > -- > 2.39.2 >
On 6/19/23 08:23, Stefan Schulze Frielinghaus via Gcc-patches wrote: > Comparisons between memory and constants might be done in a smaller mode > resulting in smaller constants which might finally end up as immediates > instead of in the literal pool. > > For example, on s390x a non-symmetric comparison like > x <= 0x3fffffffffffffff > results in the constant being spilled to the literal pool and an 8 byte > memory comparison is emitted. Ideally, an equivalent comparison > x0 <= 0x3f > where x0 is the most significant byte of x, is emitted where the > constant is smaller and more likely to materialize as an immediate. > > Similarly, comparisons of the form > x >= 0x4000000000000000 > can be shortened into x0 >= 0x40. > > Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le. > Note, the new tests show that for the mentioned little-endian targets > the optimization does not materialize since either the costs of the new > instructions are higher or they do not match. Still ok for mainline? > > gcc/ChangeLog: > > * combine.cc (simplify_compare_const): Narrow comparison of > memory and constant. > (try_combine): Adapt new function signature. > (simplify_comparison): Adapt new function signature. > > gcc/testsuite/ChangeLog: > > * gcc.dg/cmp-mem-const-1.c: New test. > * gcc.dg/cmp-mem-const-2.c: New test. > * gcc.dg/cmp-mem-const-3.c: New test. > * gcc.dg/cmp-mem-const-4.c: New test. > * gcc.dg/cmp-mem-const-5.c: New test. > * gcc.dg/cmp-mem-const-6.c: New test. > * gcc.target/s390/cmp-mem-const-1.c: New test. Sorry. I'd looked at this a while back, wanted to take another looksie and totally forgot about it. OK for the trunk. Thanks for your patience. jeff
On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Comparisons between memory and constants might be done in a smaller mode > resulting in smaller constants which might finally end up as immediates > instead of in the literal pool. > > For example, on s390x a non-symmetric comparison like > x <= 0x3fffffffffffffff > results in the constant being spilled to the literal pool and an 8 byte > memory comparison is emitted. Ideally, an equivalent comparison > x0 <= 0x3f > where x0 is the most significant byte of x, is emitted where the > constant is smaller and more likely to materialize as an immediate. > > Similarly, comparisons of the form > x >= 0x4000000000000000 > can be shortened into x0 >= 0x40. > > Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le. > Note, the new tests show that for the mentioned little-endian targets > the optimization does not materialize since either the costs of the new > instructions are higher or they do not match. Still ok for mainline? 
Hi Stefan, Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c) caused the following ICE on armv8l-unknown-linux-gnu: during RTL pass: combine ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’: ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in decompose, at rtl.h:2297 210 | } | ^ 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode> >::decompose(long long*, unsigned int, std::pair<rtx_def*, machine_mode> const&) ../../gcc/gcc/rtl.h:2297 0xaf5ab3 wide_int_ref_storage<false, true>::wide_int_ref_storage<std::pair<rtx_def*, machine_mode> >(std::pair<rtx_def*, machine_mode> const&) ../../gcc/gcc/wide-int.h:1030 0xaf5023 generic_wide_int<wide_int_ref_storage<false, true> >::generic_wide_int<std::pair<rtx_def*, machine_mode> >(std::pair<rtx_def*, machine_mode> const&) ../../gcc/gcc/wide-int.h:788 0xf916f9 simplify_const_unary_operation(rtx_code, machine_mode, rtx_def*, machine_mode) ../../gcc/gcc/simplify-rtx.cc:2131 0xf8bad5 simplify_context::simplify_unary_operation(rtx_code, machine_mode, rtx_def*, machine_mode) ../../gcc/gcc/simplify-rtx.cc:889 0xf8a591 simplify_context::simplify_gen_unary(rtx_code, machine_mode, rtx_def*, machine_mode) ../../gcc/gcc/simplify-rtx.cc:360 0x9bd1b7 simplify_gen_unary(rtx_code, machine_mode, rtx_def*, machine_mode) ../../gcc/gcc/rtl.h:3520 0x1bd5677 simplify_comparison ../../gcc/gcc/combine.cc:13125 0x1bc2b2b simplify_set ../../gcc/gcc/combine.cc:6848 0x1bc1647 combine_simplify_rtx ../../gcc/gcc/combine.cc:6353 0x1bbf97f subst ../../gcc/gcc/combine.cc:5609 0x1bb864b try_combine ../../gcc/gcc/combine.cc:3302 0x1bb30fb combine_instructions ../../gcc/gcc/combine.cc:1264 0x1bd8d25 rest_of_handle_combine ../../gcc/gcc/combine.cc:15059 0x1bd8dd5 execute ../../gcc/gcc/combine.cc:15103 Please submit a full bug report, with preprocessed source (by using -freport-bug). Please include the complete backtrace with any bug report. See <https://gcc.gnu.org/bugs/> for instructions. 
Could you please take a look ? Thanks, Prathamesh > > gcc/ChangeLog: > > * combine.cc (simplify_compare_const): Narrow comparison of > memory and constant. > (try_combine): Adapt new function signature. > (simplify_comparison): Adapt new function signature. > > gcc/testsuite/ChangeLog: > > * gcc.dg/cmp-mem-const-1.c: New test. > * gcc.dg/cmp-mem-const-2.c: New test. > * gcc.dg/cmp-mem-const-3.c: New test. > * gcc.dg/cmp-mem-const-4.c: New test. > * gcc.dg/cmp-mem-const-5.c: New test. > * gcc.dg/cmp-mem-const-6.c: New test. > * gcc.target/s390/cmp-mem-const-1.c: New test. > --- > gcc/combine.cc | 79 +++++++++++++++++-- > gcc/testsuite/gcc.dg/cmp-mem-const-1.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-2.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-3.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-4.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-5.c | 17 ++++ > gcc/testsuite/gcc.dg/cmp-mem-const-6.c | 17 ++++ > .../gcc.target/s390/cmp-mem-const-1.c | 24 ++++++ > 8 files changed, 200 insertions(+), 5 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-1.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-2.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-3.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-4.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-5.c > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-6.c > create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > > diff --git a/gcc/combine.cc b/gcc/combine.cc > index 5aa0ec5c45a..56e15a93409 100644 > --- a/gcc/combine.cc > +++ b/gcc/combine.cc > @@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx, > static int recog_for_combine (rtx *, rtx_insn *, rtx *); > static rtx gen_lowpart_for_combine (machine_mode, rtx); > static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode, > - rtx, rtx *); > + rtx *, rtx *); > static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *); > 
static void update_table_tick (rtx); > static void record_value_for_reg (rtx, rtx_insn *, rtx); > @@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, > compare_code = orig_compare_code = GET_CODE (*cc_use_loc); > if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode)) > compare_code = simplify_compare_const (compare_code, mode, > - op0, &op1); > + &op0, &op1); > target_canonicalize_comparison (&compare_code, &op0, &op1, 1); > } > > @@ -11796,13 +11796,14 @@ gen_lowpart_for_combine (machine_mode omode, rtx x) > (CODE OP0 const0_rtx) form. > > The result is a possibly different comparison code to use. > - *POP1 may be updated. */ > + *POP0 and *POP1 may be updated. */ > > static enum rtx_code > simplify_compare_const (enum rtx_code code, machine_mode mode, > - rtx op0, rtx *pop1) > + rtx *pop0, rtx *pop1) > { > scalar_int_mode int_mode; > + rtx op0 = *pop0; > HOST_WIDE_INT const_op = INTVAL (*pop1); > > /* Get the constant we are comparing against and turn off all bits > @@ -11987,6 +11988,74 @@ simplify_compare_const (enum rtx_code code, machine_mode mode, > break; > } > > + /* Narrow non-symmetric comparison of memory and constant as e.g. > + x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most > + significant byte. Likewise, transform x0...x7 >= 0x4000000000000000 into > + x0 >= 0x40. */ > + if ((code == LEU || code == LTU || code == GEU || code == GTU) > + && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode) > + && MEM_P (op0) > + && !MEM_VOLATILE_P (op0) > + /* The optimization makes only sense for constants which are big enough > + so that we have a chance to chop off something at all. */ > + && (unsigned HOST_WIDE_INT) const_op > 0xff > + /* Ensure that we do not overflow during normalization. 
*/ > + && (code != GTU || (unsigned HOST_WIDE_INT) const_op < HOST_WIDE_INT_M1U)) > + { > + unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT) const_op; > + enum rtx_code adjusted_code; > + > + /* Normalize code to either LEU or GEU. */ > + if (code == LTU) > + { > + --n; > + adjusted_code = LEU; > + } > + else if (code == GTU) > + { > + ++n; > + adjusted_code = GEU; > + } > + else > + adjusted_code = code; > + > + scalar_int_mode narrow_mode_iter; > + FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode) > + { > + unsigned nbits = GET_MODE_PRECISION (int_mode) > + - GET_MODE_PRECISION (narrow_mode_iter); > + unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1; > + unsigned HOST_WIDE_INT lower_bits = n & mask; > + if ((adjusted_code == LEU && lower_bits == mask) > + || (adjusted_code == GEU && lower_bits == 0)) > + { > + n >>= nbits; > + break; > + } > + } > + > + if (narrow_mode_iter < int_mode) > + { > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf ( > + dump_file, "narrow comparison from mode %s to %s: (MEM %s " > + HOST_WIDE_INT_PRINT_HEX ") to (MEM %s " > + HOST_WIDE_INT_PRINT_HEX ").\n", GET_MODE_NAME (int_mode), > + GET_MODE_NAME (narrow_mode_iter), GET_RTX_NAME (code), > + (unsigned HOST_WIDE_INT)const_op, GET_RTX_NAME (adjusted_code), > + n); > + } > + poly_int64 offset = (BYTES_BIG_ENDIAN > + ? 0 > + : (GET_MODE_SIZE (int_mode) > + - GET_MODE_SIZE (narrow_mode_iter))); > + *pop0 = adjust_address_nv (op0, narrow_mode_iter, offset); > + *pop1 = GEN_INT (n); > + return adjusted_code; > + } > + } > + > *pop1 = GEN_INT (const_op); > return code; > } > @@ -12179,7 +12248,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1) > > /* Try to simplify the compare to constant, possibly changing the > comparison op, and/or changing op1 to zero. 
*/ > - code = simplify_compare_const (code, raw_mode, op0, &op1); > + code = simplify_compare_const (code, raw_mode, &op0, &op1); > const_op = INTVAL (op1); > > /* Compute some predicates to simplify code below. */ > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-1.c b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c > new file mode 100644 > index 00000000000..263ad98af79 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +le_1byte_a (uint64_t *x) > +{ > + return *x <= 0x3fffffffffffffff; > +} > + > +int > +le_1byte_b (uint64_t *x) > +{ > + return *x < 0x4000000000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-2.c b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c > new file mode 100644 > index 00000000000..a7cc5348295 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +ge_1byte_a (uint64_t *x) > +{ > + return *x > 0x3fffffffffffffff; > +} > + > +int > +ge_1byte_b (uint64_t *x) > +{ > + return *x >= 0x4000000000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-3.c b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c > new file mode 100644 > index 00000000000..06f80bf72d8 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +le_2bytes_a (uint64_t *x) > 
+{ > + return *x <= 0x3ffdffffffffffff; > +} > + > +int > +le_2bytes_b (uint64_t *x) > +{ > + return *x < 0x3ffe000000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-4.c b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c > new file mode 100644 > index 00000000000..407999abf7e > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +ge_2bytes_a (uint64_t *x) > +{ > + return *x > 0x400cffffffffffff; > +} > + > +int > +ge_2bytes_b (uint64_t *x) > +{ > + return *x >= 0x400d000000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-5.c b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c > new file mode 100644 > index 00000000000..e16773f5bcf > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +le_4bytes_a (uint64_t *x) > +{ > + return *x <= 0x3ffffdffffffffff; > +} > + > +int > +le_4bytes_b (uint64_t *x) > +{ > + return *x < 0x3ffffe0000000000; > +} > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-6.c b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c > new file mode 100644 > index 00000000000..8f53b5678bd > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */ > + > +typedef __UINT64_TYPE__ uint64_t; > + > +int > +ge_4bytes_a (uint64_t *x) > +{ > + return *x > 0x4000cfffffffffff; > +} > + > +int > +ge_4bytes_b (uint64_t 
*x) > +{ > + return *x >= 0x4000d00000000000; > +} > diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > new file mode 100644 > index 00000000000..309aafbec01 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > @@ -0,0 +1,24 @@ > +/* { dg-do compile { target { lp64 } } } */ > +/* { dg-options "-O1 -march=z13 -mzarch -fdump-rtl-combine-details" } */ > +/* { dg-final { scan-assembler-not {\tclc\t} } } */ > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > + > +struct s > +{ > + long a; > + unsigned b : 1; > + unsigned c : 1; > +}; > + > +int foo (struct s *x) > +{ > + /* Expression > + x->b || x->c > + is transformed into > + _1 = BIT_FIELD_REF <*x_4(D), 64, 64>; > + _2 = _1 > 0x3FFFFFFFFFFFFFFF; > + where the constant may materialize in the literal pool and an 8 byte CLC > + may be emitted. Ensure this is not the case. > + */ > + return x->b || x->c; > +} > -- > 2.39.2 >
On Tue, 1 Aug 2023 at 03:13, Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> wrote: > > On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via > Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > > > Comparisons between memory and constants might be done in a smaller mode > > resulting in smaller constants which might finally end up as immediates > > instead of in the literal pool. > > > > For example, on s390x a non-symmetric comparison like > > x <= 0x3fffffffffffffff > > results in the constant being spilled to the literal pool and an 8 byte > > memory comparison is emitted. Ideally, an equivalent comparison > > x0 <= 0x3f > > where x0 is the most significant byte of x, is emitted where the > > constant is smaller and more likely to materialize as an immediate. > > > > Similarly, comparisons of the form > > x >= 0x4000000000000000 > > can be shortened into x0 >= 0x40. > > > > Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le. > > Note, the new tests show that for the mentioned little-endian targets > > the optimization does not materialize since either the costs of the new > > instructions are higher or they do not match. Still ok for mainline? > Hi Stefan, > Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c) > caused the following ICE on armv8l-unknown-linux-gnu: Sorry, I meant armv8l-unknown-linux-gnueabihf. 
> during RTL pass: combine > ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’: > ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in > decompose, at rtl.h:2297 > 210 | } > | ^ > 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode> > >::decompose(long long*, unsigned int, std::pair<rtx_def*, > machine_mode> const&) > ../../gcc/gcc/rtl.h:2297 > 0xaf5ab3 wide_int_ref_storage<false, > true>::wide_int_ref_storage<std::pair<rtx_def*, machine_mode> > >(std::pair<rtx_def*, machine_mode> const&) > ../../gcc/gcc/wide-int.h:1030 > 0xaf5023 generic_wide_int<wide_int_ref_storage<false, true> > >::generic_wide_int<std::pair<rtx_def*, machine_mode> > >(std::pair<rtx_def*, machine_mode> const&) > ../../gcc/gcc/wide-int.h:788 > 0xf916f9 simplify_const_unary_operation(rtx_code, machine_mode, > rtx_def*, machine_mode) > ../../gcc/gcc/simplify-rtx.cc:2131 > 0xf8bad5 simplify_context::simplify_unary_operation(rtx_code, > machine_mode, rtx_def*, machine_mode) > ../../gcc/gcc/simplify-rtx.cc:889 > 0xf8a591 simplify_context::simplify_gen_unary(rtx_code, machine_mode, > rtx_def*, machine_mode) > ../../gcc/gcc/simplify-rtx.cc:360 > 0x9bd1b7 simplify_gen_unary(rtx_code, machine_mode, rtx_def*, machine_mode) > ../../gcc/gcc/rtl.h:3520 > 0x1bd5677 simplify_comparison > ../../gcc/gcc/combine.cc:13125 > 0x1bc2b2b simplify_set > ../../gcc/gcc/combine.cc:6848 > 0x1bc1647 combine_simplify_rtx > ../../gcc/gcc/combine.cc:6353 > 0x1bbf97f subst > ../../gcc/gcc/combine.cc:5609 > 0x1bb864b try_combine > ../../gcc/gcc/combine.cc:3302 > 0x1bb30fb combine_instructions > ../../gcc/gcc/combine.cc:1264 > 0x1bd8d25 rest_of_handle_combine > ../../gcc/gcc/combine.cc:15059 > 0x1bd8dd5 execute > ../../gcc/gcc/combine.cc:15103 > Please submit a full bug report, with preprocessed source (by using > -freport-bug). > Please include the complete backtrace with any bug report. > See <https://gcc.gnu.org/bugs/> for instructions. > > Could you please take a look ? 
> > Thanks, > Prathamesh > > > > gcc/ChangeLog: > > > > * combine.cc (simplify_compare_const): Narrow comparison of > > memory and constant. > > (try_combine): Adapt new function signature. > > (simplify_comparison): Adapt new function signature. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.dg/cmp-mem-const-1.c: New test. > > * gcc.dg/cmp-mem-const-2.c: New test. > > * gcc.dg/cmp-mem-const-3.c: New test. > > * gcc.dg/cmp-mem-const-4.c: New test. > > * gcc.dg/cmp-mem-const-5.c: New test. > > * gcc.dg/cmp-mem-const-6.c: New test. > > * gcc.target/s390/cmp-mem-const-1.c: New test. > > --- > > gcc/combine.cc | 79 +++++++++++++++++-- > > gcc/testsuite/gcc.dg/cmp-mem-const-1.c | 17 ++++ > > gcc/testsuite/gcc.dg/cmp-mem-const-2.c | 17 ++++ > > gcc/testsuite/gcc.dg/cmp-mem-const-3.c | 17 ++++ > > gcc/testsuite/gcc.dg/cmp-mem-const-4.c | 17 ++++ > > gcc/testsuite/gcc.dg/cmp-mem-const-5.c | 17 ++++ > > gcc/testsuite/gcc.dg/cmp-mem-const-6.c | 17 ++++ > > .../gcc.target/s390/cmp-mem-const-1.c | 24 ++++++ > > 8 files changed, 200 insertions(+), 5 deletions(-) > > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-1.c > > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-2.c > > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-3.c > > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-4.c > > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-5.c > > create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-6.c > > create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > > > > diff --git a/gcc/combine.cc b/gcc/combine.cc > > index 5aa0ec5c45a..56e15a93409 100644 > > --- a/gcc/combine.cc > > +++ b/gcc/combine.cc > > @@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx, > > static int recog_for_combine (rtx *, rtx_insn *, rtx *); > > static rtx gen_lowpart_for_combine (machine_mode, rtx); > > static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode, > > - rtx, rtx *); > > + rtx *, rtx *); > > static 
enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *); > > static void update_table_tick (rtx); > > static void record_value_for_reg (rtx, rtx_insn *, rtx); > > @@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, > > compare_code = orig_compare_code = GET_CODE (*cc_use_loc); > > if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode)) > > compare_code = simplify_compare_const (compare_code, mode, > > - op0, &op1); > > + &op0, &op1); > > target_canonicalize_comparison (&compare_code, &op0, &op1, 1); > > } > > > > @@ -11796,13 +11796,14 @@ gen_lowpart_for_combine (machine_mode omode, rtx x) > > (CODE OP0 const0_rtx) form. > > > > The result is a possibly different comparison code to use. > > - *POP1 may be updated. */ > > + *POP0 and *POP1 may be updated. */ > > > > static enum rtx_code > > simplify_compare_const (enum rtx_code code, machine_mode mode, > > - rtx op0, rtx *pop1) > > + rtx *pop0, rtx *pop1) > > { > > scalar_int_mode int_mode; > > + rtx op0 = *pop0; > > HOST_WIDE_INT const_op = INTVAL (*pop1); > > > > /* Get the constant we are comparing against and turn off all bits > > @@ -11987,6 +11988,74 @@ simplify_compare_const (enum rtx_code code, machine_mode mode, > > break; > > } > > > > + /* Narrow non-symmetric comparison of memory and constant as e.g. > > + x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most > > + significant byte. Likewise, transform x0...x7 >= 0x4000000000000000 into > > + x0 >= 0x40. */ > > + if ((code == LEU || code == LTU || code == GEU || code == GTU) > > + && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode) > > + && MEM_P (op0) > > + && !MEM_VOLATILE_P (op0) > > + /* The optimization makes only sense for constants which are big enough > > + so that we have a chance to chop off something at all. */ > > + && (unsigned HOST_WIDE_INT) const_op > 0xff > > + /* Ensure that we do not overflow during normalization. 
*/ > > + && (code != GTU || (unsigned HOST_WIDE_INT) const_op < HOST_WIDE_INT_M1U)) > > + { > > + unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT) const_op; > > + enum rtx_code adjusted_code; > > + > > + /* Normalize code to either LEU or GEU. */ > > + if (code == LTU) > > + { > > + --n; > > + adjusted_code = LEU; > > + } > > + else if (code == GTU) > > + { > > + ++n; > > + adjusted_code = GEU; > > + } > > + else > > + adjusted_code = code; > > + > > + scalar_int_mode narrow_mode_iter; > > + FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode) > > + { > > + unsigned nbits = GET_MODE_PRECISION (int_mode) > > + - GET_MODE_PRECISION (narrow_mode_iter); > > + unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1; > > + unsigned HOST_WIDE_INT lower_bits = n & mask; > > + if ((adjusted_code == LEU && lower_bits == mask) > > + || (adjusted_code == GEU && lower_bits == 0)) > > + { > > + n >>= nbits; > > + break; > > + } > > + } > > + > > + if (narrow_mode_iter < int_mode) > > + { > > + if (dump_file && (dump_flags & TDF_DETAILS)) > > + { > > + fprintf ( > > + dump_file, "narrow comparison from mode %s to %s: (MEM %s " > > + HOST_WIDE_INT_PRINT_HEX ") to (MEM %s " > > + HOST_WIDE_INT_PRINT_HEX ").\n", GET_MODE_NAME (int_mode), > > + GET_MODE_NAME (narrow_mode_iter), GET_RTX_NAME (code), > > + (unsigned HOST_WIDE_INT)const_op, GET_RTX_NAME (adjusted_code), > > + n); > > + } > > + poly_int64 offset = (BYTES_BIG_ENDIAN > > + ? 0 > > + : (GET_MODE_SIZE (int_mode) > > + - GET_MODE_SIZE (narrow_mode_iter))); > > + *pop0 = adjust_address_nv (op0, narrow_mode_iter, offset); > > + *pop1 = GEN_INT (n); > > + return adjusted_code; > > + } > > + } > > + > > *pop1 = GEN_INT (const_op); > > return code; > > } > > @@ -12179,7 +12248,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1) > > > > /* Try to simplify the compare to constant, possibly changing the > > comparison op, and/or changing op1 to zero. 
*/ > > - code = simplify_compare_const (code, raw_mode, op0, &op1); > > + code = simplify_compare_const (code, raw_mode, &op0, &op1); > > const_op = INTVAL (op1); > > > > /* Compute some predicates to simplify code below. */ > > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-1.c b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c > > new file mode 100644 > > index 00000000000..263ad98af79 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile { target { lp64 } } } */ > > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > > + > > +typedef __UINT64_TYPE__ uint64_t; > > + > > +int > > +le_1byte_a (uint64_t *x) > > +{ > > + return *x <= 0x3fffffffffffffff; > > +} > > + > > +int > > +le_1byte_b (uint64_t *x) > > +{ > > + return *x < 0x4000000000000000; > > +} > > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-2.c b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c > > new file mode 100644 > > index 00000000000..a7cc5348295 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile { target { lp64 } } } */ > > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > > + > > +typedef __UINT64_TYPE__ uint64_t; > > + > > +int > > +ge_1byte_a (uint64_t *x) > > +{ > > + return *x > 0x3fffffffffffffff; > > +} > > + > > +int > > +ge_1byte_b (uint64_t *x) > > +{ > > + return *x >= 0x4000000000000000; > > +} > > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-3.c b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c > > new file mode 100644 > > index 00000000000..06f80bf72d8 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile { target { lp64 } } } */ > > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > > +/* { dg-final { scan-rtl-dump "narrow comparison 
from mode DI to HI" "combine" } } */ > > + > > +typedef __UINT64_TYPE__ uint64_t; > > + > > +int > > +le_2bytes_a (uint64_t *x) > > +{ > > + return *x <= 0x3ffdffffffffffff; > > +} > > + > > +int > > +le_2bytes_b (uint64_t *x) > > +{ > > + return *x < 0x3ffe000000000000; > > +} > > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-4.c b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c > > new file mode 100644 > > index 00000000000..407999abf7e > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile { target { lp64 } } } */ > > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */ > > + > > +typedef __UINT64_TYPE__ uint64_t; > > + > > +int > > +ge_2bytes_a (uint64_t *x) > > +{ > > + return *x > 0x400cffffffffffff; > > +} > > + > > +int > > +ge_2bytes_b (uint64_t *x) > > +{ > > + return *x >= 0x400d000000000000; > > +} > > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-5.c b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c > > new file mode 100644 > > index 00000000000..e16773f5bcf > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile { target { lp64 } } } */ > > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ > > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */ > > + > > +typedef __UINT64_TYPE__ uint64_t; > > + > > +int > > +le_4bytes_a (uint64_t *x) > > +{ > > + return *x <= 0x3ffffdffffffffff; > > +} > > + > > +int > > +le_4bytes_b (uint64_t *x) > > +{ > > + return *x < 0x3ffffe0000000000; > > +} > > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-6.c b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c > > new file mode 100644 > > index 00000000000..8f53b5678bd > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile { target { lp64 } } } */ > > +/* { dg-options "-O1 -fdump-rtl-combine-details" 
} */ > > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */ > > + > > +typedef __UINT64_TYPE__ uint64_t; > > + > > +int > > +ge_4bytes_a (uint64_t *x) > > +{ > > + return *x > 0x4000cfffffffffff; > > +} > > + > > +int > > +ge_4bytes_b (uint64_t *x) > > +{ > > + return *x >= 0x4000d00000000000; > > +} > > diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > > new file mode 100644 > > index 00000000000..309aafbec01 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c > > @@ -0,0 +1,24 @@ > > +/* { dg-do compile { target { lp64 } } } */ > > +/* { dg-options "-O1 -march=z13 -mzarch -fdump-rtl-combine-details" } */ > > +/* { dg-final { scan-assembler-not {\tclc\t} } } */ > > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ > > + > > +struct s > > +{ > > + long a; > > + unsigned b : 1; > > + unsigned c : 1; > > +}; > > + > > +int foo (struct s *x) > > +{ > > + /* Expression > > + x->b || x->c > > + is transformed into > > + _1 = BIT_FIELD_REF <*x_4(D), 64, 64>; > > + _2 = _1 > 0x3FFFFFFFFFFFFFFF; > > + where the constant may materialize in the literal pool and an 8 byte CLC > > + may be emitted. Ensure this is not the case. > > + */ > > + return x->b || x->c; > > +} > > -- > > 2.39.2 > >
On 7/31/23 15:43, Prathamesh Kulkarni via Gcc-patches wrote: > On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via > Gcc-patches <gcc-patches@gcc.gnu.org> wrote: >> >> Comparisons between memory and constants might be done in a smaller mode >> resulting in smaller constants which might finally end up as immediates >> instead of in the literal pool. >> >> For example, on s390x a non-symmetric comparison like >> x <= 0x3fffffffffffffff >> results in the constant being spilled to the literal pool and an 8 byte >> memory comparison is emitted. Ideally, an equivalent comparison >> x0 <= 0x3f >> where x0 is the most significant byte of x, is emitted where the >> constant is smaller and more likely to materialize as an immediate. >> >> Similarly, comparisons of the form >> x >= 0x4000000000000000 >> can be shortened into x0 >= 0x40. >> >> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le. >> Note, the new tests show that for the mentioned little-endian targets >> the optimization does not materialize since either the costs of the new >> instructions are higher or they do not match. Still ok for mainline? > Hi Stefan, > Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c) > caused the following ICE on armv8l-unknown-linux-gnu: > during RTL pass: combine > ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’: > ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in > decompose, at rtl.h:2297 > 210 | } > | ^ > 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode> >> ::decompose(long long*, unsigned int, std::pair<rtx_def*, > machine_mode> const&) > ../../gcc/gcc/rtl.h:2297 [ ... ] Yea, we're seeing something very similar on nios2-linux-gnu building the kernel. Prathamesh, can you extract the .i file for fixed-bit on armv8 and open a bug for this issue, attaching the .i file as well as the right command line options necessary to reproduce the failure. 
That way Stefan can tackle it with a cross compiler. Thanks, jeff
On Tue, 1 Aug 2023 at 05:20, Jeff Law <jeffreyalaw@gmail.com> wrote: > > > > On 7/31/23 15:43, Prathamesh Kulkarni via Gcc-patches wrote: > > On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via > > Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > >> > >> Comparisons between memory and constants might be done in a smaller mode > >> resulting in smaller constants which might finally end up as immediates > >> instead of in the literal pool. > >> > >> For example, on s390x a non-symmetric comparison like > >> x <= 0x3fffffffffffffff > >> results in the constant being spilled to the literal pool and an 8 byte > >> memory comparison is emitted. Ideally, an equivalent comparison > >> x0 <= 0x3f > >> where x0 is the most significant byte of x, is emitted where the > >> constant is smaller and more likely to materialize as an immediate. > >> > >> Similarly, comparisons of the form > >> x >= 0x4000000000000000 > >> can be shortened into x0 >= 0x40. > >> > >> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le. > >> Note, the new tests show that for the mentioned little-endian targets > >> the optimization does not materialize since either the costs of the new > >> instructions are higher or they do not match. Still ok for mainline? > > Hi Stefan, > > Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c) > > caused the following ICE on armv8l-unknown-linux-gnu: > > during RTL pass: combine > > ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’: > > ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in > > decompose, at rtl.h:2297 > > 210 | } > > | ^ > > 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode> > >> ::decompose(long long*, unsigned int, std::pair<rtx_def*, > > machine_mode> const&) > > ../../gcc/gcc/rtl.h:2297 > [ ... ] > Yea, we're seeing something very similar on nios2-linux-gnu building the > kernel. 
> > Prathamesh, can you extract the .i file for fixed-bit on armv8 and open > a bug for this issue, attaching the .i file as well as the right command > line options necessary to reproduce the failure. That way Stefan can > tackle it with a cross compiler. Hi Jeff, Filed the issue in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110867 Thanks, Prathamesh > > Thanks, > jeff
On Tue, Aug 01, 2023 at 01:52:16PM +0530, Prathamesh Kulkarni wrote: > On Tue, 1 Aug 2023 at 05:20, Jeff Law <jeffreyalaw@gmail.com> wrote: > > > > > > > > On 7/31/23 15:43, Prathamesh Kulkarni via Gcc-patches wrote: > > > On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via > > > Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > >> > > >> Comparisons between memory and constants might be done in a smaller mode > > >> resulting in smaller constants which might finally end up as immediates > > >> instead of in the literal pool. > > >> > > >> For example, on s390x a non-symmetric comparison like > > >> x <= 0x3fffffffffffffff > > >> results in the constant being spilled to the literal pool and an 8 byte > > >> memory comparison is emitted. Ideally, an equivalent comparison > > >> x0 <= 0x3f > > >> where x0 is the most significant byte of x, is emitted where the > > >> constant is smaller and more likely to materialize as an immediate. > > >> > > >> Similarly, comparisons of the form > > >> x >= 0x4000000000000000 > > >> can be shortened into x0 >= 0x40. > > >> > > >> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le. > > >> Note, the new tests show that for the mentioned little-endian targets > > >> the optimization does not materialize since either the costs of the new > > >> instructions are higher or they do not match. Still ok for mainline? > > > Hi Stefan, > > > Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c) > > > caused the following ICE on armv8l-unknown-linux-gnu: > > > during RTL pass: combine > > > ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’: > > > ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in > > > decompose, at rtl.h:2297 > > > 210 | } > > > | ^ > > > 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode> > > >> ::decompose(long long*, unsigned int, std::pair<rtx_def*, > > > machine_mode> const&) > > > ../../gcc/gcc/rtl.h:2297 > > [ ... 
] > > Yea, we're seeing something very similar on nios2-linux-gnu building the > > kernel. > > > > Prathamesh, can you extract the .i file for fixed-bit on armv8 and open > > a bug for this issue, attaching the .i file as well as the right command > > line options necessary to reproduce the failure. That way Stefan can > > tackle it with a cross compiler. > Hi Jeff, > Filed the issue in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110867 Hi Prathamesh, Sorry for the inconvenience. I will have a look at this and thanks for the small reproducer. I have already started setting up a cross compiler. Thanks, Stefan > > Thanks, > Prathamesh > > > > Thanks, > > jeff
diff --git a/gcc/combine.cc b/gcc/combine.cc index 5aa0ec5c45a..56e15a93409 100644 --- a/gcc/combine.cc +++ b/gcc/combine.cc @@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx, static int recog_for_combine (rtx *, rtx_insn *, rtx *); static rtx gen_lowpart_for_combine (machine_mode, rtx); static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode, - rtx, rtx *); + rtx *, rtx *); static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *); static void update_table_tick (rtx); static void record_value_for_reg (rtx, rtx_insn *, rtx); @@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, compare_code = orig_compare_code = GET_CODE (*cc_use_loc); if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode)) compare_code = simplify_compare_const (compare_code, mode, - op0, &op1); + &op0, &op1); target_canonicalize_comparison (&compare_code, &op0, &op1, 1); } @@ -11796,13 +11796,14 @@ gen_lowpart_for_combine (machine_mode omode, rtx x) (CODE OP0 const0_rtx) form. The result is a possibly different comparison code to use. - *POP1 may be updated. */ + *POP0 and *POP1 may be updated. */ static enum rtx_code simplify_compare_const (enum rtx_code code, machine_mode mode, - rtx op0, rtx *pop1) + rtx *pop0, rtx *pop1) { scalar_int_mode int_mode; + rtx op0 = *pop0; HOST_WIDE_INT const_op = INTVAL (*pop1); /* Get the constant we are comparing against and turn off all bits @@ -11987,6 +11988,74 @@ simplify_compare_const (enum rtx_code code, machine_mode mode, break; } + /* Narrow non-symmetric comparison of memory and constant as e.g. + x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most + significant byte. Likewise, transform x0...x7 >= 0x4000000000000000 into + x0 >= 0x40. 
*/ + if ((code == LEU || code == LTU || code == GEU || code == GTU) + && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode) + && MEM_P (op0) + && !MEM_VOLATILE_P (op0) + /* The optimization makes only sense for constants which are big enough + so that we have a chance to chop off something at all. */ + && (unsigned HOST_WIDE_INT) const_op > 0xff + /* Ensure that we do not overflow during normalization. */ + && (code != GTU || (unsigned HOST_WIDE_INT) const_op < HOST_WIDE_INT_M1U)) + { + unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT) const_op; + enum rtx_code adjusted_code; + + /* Normalize code to either LEU or GEU. */ + if (code == LTU) + { + --n; + adjusted_code = LEU; + } + else if (code == GTU) + { + ++n; + adjusted_code = GEU; + } + else + adjusted_code = code; + + scalar_int_mode narrow_mode_iter; + FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode) + { + unsigned nbits = GET_MODE_PRECISION (int_mode) + - GET_MODE_PRECISION (narrow_mode_iter); + unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1; + unsigned HOST_WIDE_INT lower_bits = n & mask; + if ((adjusted_code == LEU && lower_bits == mask) + || (adjusted_code == GEU && lower_bits == 0)) + { + n >>= nbits; + break; + } + } + + if (narrow_mode_iter < int_mode) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf ( + dump_file, "narrow comparison from mode %s to %s: (MEM %s " + HOST_WIDE_INT_PRINT_HEX ") to (MEM %s " + HOST_WIDE_INT_PRINT_HEX ").\n", GET_MODE_NAME (int_mode), + GET_MODE_NAME (narrow_mode_iter), GET_RTX_NAME (code), + (unsigned HOST_WIDE_INT)const_op, GET_RTX_NAME (adjusted_code), + n); + } + poly_int64 offset = (BYTES_BIG_ENDIAN + ? 
0 + : (GET_MODE_SIZE (int_mode) + - GET_MODE_SIZE (narrow_mode_iter))); + *pop0 = adjust_address_nv (op0, narrow_mode_iter, offset); + *pop1 = GEN_INT (n); + return adjusted_code; + } + } + *pop1 = GEN_INT (const_op); return code; } @@ -12179,7 +12248,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1) /* Try to simplify the compare to constant, possibly changing the comparison op, and/or changing op1 to zero. */ - code = simplify_compare_const (code, raw_mode, op0, &op1); + code = simplify_compare_const (code, raw_mode, &op0, &op1); const_op = INTVAL (op1); /* Compute some predicates to simplify code below. */ diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-1.c b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c new file mode 100644 index 00000000000..263ad98af79 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { lp64 } } } */ +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ + +typedef __UINT64_TYPE__ uint64_t; + +int +le_1byte_a (uint64_t *x) +{ + return *x <= 0x3fffffffffffffff; +} + +int +le_1byte_b (uint64_t *x) +{ + return *x < 0x4000000000000000; +} diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-2.c b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c new file mode 100644 index 00000000000..a7cc5348295 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { lp64 } } } */ +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ + +typedef __UINT64_TYPE__ uint64_t; + +int +ge_1byte_a (uint64_t *x) +{ + return *x > 0x3fffffffffffffff; +} + +int +ge_1byte_b (uint64_t *x) +{ + return *x >= 0x4000000000000000; +} diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-3.c b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c new file mode 100644 index 00000000000..06f80bf72d8 --- /dev/null +++ 
b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { lp64 } } } */ +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */ + +typedef __UINT64_TYPE__ uint64_t; + +int +le_2bytes_a (uint64_t *x) +{ + return *x <= 0x3ffdffffffffffff; +} + +int +le_2bytes_b (uint64_t *x) +{ + return *x < 0x3ffe000000000000; +} diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-4.c b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c new file mode 100644 index 00000000000..407999abf7e --- /dev/null +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { lp64 } } } */ +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */ + +typedef __UINT64_TYPE__ uint64_t; + +int +ge_2bytes_a (uint64_t *x) +{ + return *x > 0x400cffffffffffff; +} + +int +ge_2bytes_b (uint64_t *x) +{ + return *x >= 0x400d000000000000; +} diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-5.c b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c new file mode 100644 index 00000000000..e16773f5bcf --- /dev/null +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { lp64 } } } */ +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */ + +typedef __UINT64_TYPE__ uint64_t; + +int +le_4bytes_a (uint64_t *x) +{ + return *x <= 0x3ffffdffffffffff; +} + +int +le_4bytes_b (uint64_t *x) +{ + return *x < 0x3ffffe0000000000; +} diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-6.c b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c new file mode 100644 index 00000000000..8f53b5678bd --- /dev/null +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { lp64 } } } */ +/* { dg-options "-O1 -fdump-rtl-combine-details" } */ +/* { dg-final { scan-rtl-dump "narrow comparison from mode 
DI to SI" "combine" } } */ + +typedef __UINT64_TYPE__ uint64_t; + +int +ge_4bytes_a (uint64_t *x) +{ + return *x > 0x4000cfffffffffff; +} + +int +ge_4bytes_b (uint64_t *x) +{ + return *x >= 0x4000d00000000000; +} diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c new file mode 100644 index 00000000000..309aafbec01 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c @@ -0,0 +1,24 @@ +/* { dg-do compile { target { lp64 } } } */ +/* { dg-options "-O1 -march=z13 -mzarch -fdump-rtl-combine-details" } */ +/* { dg-final { scan-assembler-not {\tclc\t} } } */ +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */ + +struct s +{ + long a; + unsigned b : 1; + unsigned c : 1; +}; + +int foo (struct s *x) +{ + /* Expression + x->b || x->c + is transformed into + _1 = BIT_FIELD_REF <*x_4(D), 64, 64>; + _2 = _1 > 0x3FFFFFFFFFFFFFFF; + where the constant may materialize in the literal pool and an 8 byte CLC + may be emitted. Ensure this is not the case. + */ + return x->b || x->c; +}