diff mbox series

[gcc-15,1/3] RISC-V: avoid LUI based const materialization ... [part of PR/106265]

Message ID 20240316173524.1147760-2-vineetg@rivosinc.com
State New
Headers show
Series RISC-V improve stack/array access by constant mat tweak | expand

Commit Message

Vineet Gupta March 16, 2024, 5:35 p.m. UTC
... if the constant can be represented as sum of two S12 values.
The two S12 values could instead be fused with subsequent ADD insn.
The helps
 - avoid an additional LUI insn
 - side benefits of not clobbering a reg

e.g.
                            w/o patch             w/ patch
long                  |                     |
plus(unsigned long i) |	li	a5,4096     |
{                     |	addi	a5,a5,-2032 | addi a0, a0, 2047
   return i + 2064;   |	add	a0,a0,a5    | addi a0, a0, 17
}                     | 	ret         | ret

NOTE: In theory not having const in a standalone reg might seem less
      CSE friendly, but for workloads in consideration these mat are
      from very late LRA reloads and follow on GCSE is not doing much
      currently.

The real benefit however is seen in base+offset computation for array
accesses and especially for stack accesses which are finalized late in
optim pipeline, during LRA register allocation. Often the finalized
offsets trigger LRA reloads resulting in mind boggling repetition of
exact same insn sequence including LUI based constant materialization.

This shaves off 290 billion dynamic instrustions (QEMU icounts) in
SPEC 2017 Cactu benchmark which is over 10% of workload. In the rest of
suite, there additional 10 billion shaved, with both gains and losses
in indiv workloads as is usual with compiler changes.

1,214,172,747,858 | 1,212,530,889,930 |  0.14%  <-
  740,618,658,160 |   739,515,555,243 |  0.15%  <-
  692,128,469,436 |   691,175,291,593 |  0.14%  <-
  190,811,495,310 |   190,854,437,311 | -0.02%
  225,751,808,189 |   225,818,881,019 | -0.03%
  220,364,237,915 |   220,405,868,038 | -0.02%
  179,157,361,261 |   179,186,059,187 | -0.02%
  219,313,921,837 |   219,337,232,829 | -0.01%
  278,733,265,382 |   278,733,264,380 |  0.00%
  442,397,437,578 |   442,397,436,026 |  0.00%
  344,112,144,242 |   344,112,142,910 |  0.00%
  417,561,390,319 |   417,561,388,877 |  0.00%
  669,319,255,911 |   669,318,761,114 |  0.00%
2,852,781,330,203 | 2,564,750,360,011 | 10.10% <--
1,855,884,349,001 | 1,855,881,122,280 |  0.00%
1,653,856,904,409 | 1,653,733,205,926 |  0.01%
2,974,347,350,401 | 2,974,174,999,505 |  0.01%
1,158,337,609,995 | 1,158,337,608,826 |  0.00%
1,021,799,191,806 | 1,021,425,634,319 |  0.04%
1,716,448,413,824 | 1,714,912,501,736 |  0.09%
  849,330,283,510 |   849,321,128,484 |  0.00%
  276,556,968,005 |   276,232,659,282 |  0.12%  <-
  913,352,953,686 |   912,822,592,719 |  0.06%
  903,792,550,116 |   903,107,637,917 |  0.08%
1,654,904,617,482 | 1,655,327,175,238 | -0.03%
1,495,841,421,311 | 1,493,418,761,599 |  0.16%  <-
1,641,969,530,523 | 1,642,126,596,979 | -0.01%
2,546,170,307,385 | 2,546,151,690,122 |  0.00%
1,983,551,321,388 | 1,983,525,296,345 |  0.00%
1,516,077,036,742 | 1,516,076,833,481 |  0.00%
2,055,868,055,165 | 2,055,865,373,175 |  0.00%
1,000,621,723,807 | 1,000,601,360,859 |  0.00%
1,024,149,313,694 | 1,024,127,472,779 |  0.00%
  363,827,037,541 |   363,057,012,239 |  0.21%  <-
  906,649,110,459 |   905,928,886,712 |  0.08%
  509,023,896,044 |   508,140,354,911 |  0.17%  <-
      403,238,430 |       403,238,485 |  0.00%
      403,238,430 |       403,238,485 |  0.00%

This should still be considered damage control as the real/deeper fix
would be to reduce number of LRA reloads or CSE/anchor those during
LRA constraint sub-pass (re)runs.

Implementation Details (for posterity)
--------------------------------------
 - basic idea is to have a splitter selected via a new predicate for constant
   being possible sum of two S12 and provide the transform.
   This is however a 2 -> 2 transform which combine can't handle.
   So we specify it using a define_insn_and_split.

 - the initial loose "i" constraint caused LRA to accept invalid insns thus
   needing a tighter new constraint as well.

 - An additional fallback alternate with catch-all "r" register
   constraint also needed to allow any "reloads" that LRA might
   require for ADDI with const larger than S12.

Testing
--------
This is testsuite clean (rv64 only).
I'll rely on post-commit CI multlib run for any possible fallout for
other setups such as rv32.

|                            |          gcc |          g++ |     gfortran |
| rv64imafdc_zba_zbb_zbs_zicond/  lp64d/ medlow |  181 /    44 |    4 /     1 |   72 /    12 |
| rv64imafdc_zba_zbb_zbs_zicond/  lp64d/ medlow |  181 /    44 |    4 /     1 |   73 /    13 |
|

I also threw this into a buildroot run, it obviously boots Linux to
userspace. bloat-o-meter on glibc and kernel show overall decrease in
staic instruction counts with some minor spot increases.
These are generally in the category of

 - LUI + ADDI are 2 byte each vs. two ADD being 4 byte each.
 - Knock on effects due to inlining changes.
 - Sometimes the slightly shorter 2-insn seq in a mult-exit function
   can cause in-place epilogue duplication (vs. a jump back).
   This is slightly larger but more efficient in execution.
In summary nothing to fret about.

| linux/scripts/bloat-o-meter build-gcc-240131/target/lib/libc.so.6 \
         build-gcc-240131-new-splitter-1-variant/target/lib/libc.so.6
|
| add/remove: 0/0 grow/shrink: 21/49 up/down: 520/-3056 (-2536)
| Function                                     old     new   delta
| getnameinfo                                 2756    2892    +136
...
| tempnam                                      136     144      +8
| padzero                                      276     284      +8
...
| __GI___register_printf_specifier             284     280      -4
| __EI_xdr_array                               468     464      -4
| try_file_lock                                268     260      -8
| pthread_create@GLIBC_2                      3520    3508     -12
| __pthread_create_2_1                        3520    3508     -12
...
| _nss_files_setnetgrent                       932     904     -28
| _nss_dns_gethostbyaddr2_r                   1524    1480     -44
| build_trtable                               3312    3208    -104
| printf_positional                          25000   22580   -2420
| Total: Before=2107999, After=2105463, chg -0.12%

gcc/ChangeLog:
	PR target/106265 (partially)
	* config/riscv/riscv.h: New macros to check for sum of two S12
	range.
	* config/riscv/constraints.md: New constraint.
	* config/riscv/predicates.md: New Predicate.
	* config/riscv/riscv.md: New splitter to not materialize
	constant in desired range.

gcc/testsuite/ChangeLog:
	* gcc.target/riscv/sum-of-two-s12-const-1.c: New test: checks for
	new patterns output.
	* gcc.target/riscv/sum-of-two-s12-const-2.c: New test: should not
	ICE.

Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
---
 gcc/config/riscv/constraints.md               |  6 +++
 gcc/config/riscv/predicates.md                |  6 +++
 gcc/config/riscv/riscv.h                      | 15 +++++++
 gcc/config/riscv/riscv.md                     | 34 ++++++++++++++
 .../gcc.target/riscv/sum-of-two-s12-const-1.c | 45 +++++++++++++++++++
 .../gcc.target/riscv/sum-of-two-s12-const-2.c | 22 +++++++++
 6 files changed, 128 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-2.c

Comments

Jeff Law March 16, 2024, 8:28 p.m. UTC | #1
On 3/16/24 11:35 AM, Vineet Gupta wrote:
> ... if the constant can be represented as sum of two S12 values.
> The two S12 values could instead be fused with subsequent ADD insn.
> The helps
>   - avoid an additional LUI insn
>   - side benefits of not clobbering a reg
> 
> e.g.
>                              w/o patch             w/ patch
> long                  |                     |
> plus(unsigned long i) |	li	a5,4096     |
> {                     |	addi	a5,a5,-2032 | addi a0, a0, 2047
>     return i + 2064;   |	add	a0,a0,a5    | addi a0, a0, 17
> }                     | 	ret         | ret
> 
> NOTE: In theory not having const in a standalone reg might seem less
>        CSE friendly, but for workloads in consideration these mat are
>        from very late LRA reloads and follow on GCSE is not doing much
>        currently.
As you note, there's a bit of natural tension between what to expose and 
when.  There's no clear cut answer and I might argue there never will be 
given the design and implementation of various parts of the RTL pipeline.

We have some ports that expose early and just say "tough" if it's a 
minor loss in some cases.  Others choose to expose late.  We're kindof a 
mix of both in RISC-V land.   The limited offsets we've got will tend to 
make this problem a bit worse for RISC-V compared to other architectures.



> 
> The real benefit however is seen in base+offset computation for array
> accesses and especially for stack accesses which are finalized late in
> optim pipeline, during LRA register allocation. Often the finalized
> offsets trigger LRA reloads resulting in mind boggling repetition of
> exact same insn sequence including LUI based constant materialization.
Yea, this is a known sore spot.  I spent a good deal of time working on 
the PA where we only have a 5 bit displacement for FP memory ops.  You 
can assume this caused reload fits and often resulted in horrific (and 
repetitive code) code.

We did some interesting tricks to avoid the worst of the codegen issues. 
  None of those tricks really apply here since we're in an LRA world and 
there's no difference in the allowed offset in a memory reference vs an 
addi instruction.



> 
> This shaves off 290 billion dynamic instrustions (QEMU icounts) in
> SPEC 2017 Cactu benchmark which is over 10% of workload. In the rest of
> suite, there additional 10 billion shaved, with both gains and losses
> in indiv workloads as is usual with compiler changes.
That's a fantastic result.

> 
> This should still be considered damage control as the real/deeper fix
> would be to reduce number of LRA reloads or CSE/anchor those during
> LRA constraint sub-pass (re)runs.
Agreed.  But I think it's worth tackling from both ends.  Generate 
better code when we do have these reloads and avoid the reloads when we can.


> 
> Implementation Details (for posterity)
> --------------------------------------
>   - basic idea is to have a splitter selected via a new predicate for constant
>     being possible sum of two S12 and provide the transform.
>     This is however a 2 -> 2 transform which combine can't handle.
>     So we specify it using a define_insn_and_split.
Right.  For the record it looks like a 2->2 case because of the 
mvconst_internal define_insn_and_split.

What I was talking about in the Tuesday meeting last week was the desire 
to rethink that pattern precisely because it drives us to need to 
implement patterns like yours rather than the more natural 3->2 or 4->3/2.

Furthermore, I have a suspicion that there's logicals where we're going 
to want to do something similar and if we keep the mvconst_internal 
pattern they're all going to have to be implemented as 2->2s with a 
define_insn_and_split rather than the more natural 3->2.

Having said all that, I suspect the case you're diving into is far more 
important than the logicals.



> 
> | linux/scripts/bloat-o-meter build-gcc-240131/target/lib/libc.so.6 \
>           build-gcc-240131-new-splitter-1-variant/target/lib/libc.so.6
> |
> | add/remove: 0/0 grow/shrink: 21/49 up/down: 520/-3056 (-2536)
> | Function                                     old     new   delta
> | getnameinfo                                 2756    2892    +136
> ...
> | tempnam                                      136     144      +8
> | padzero                                      276     284      +8
> ...
> | __GI___register_printf_specifier             284     280      -4
> | __EI_xdr_array                               468     464      -4
> | try_file_lock                                268     260      -8
> | pthread_create@GLIBC_2                      3520    3508     -12
> | __pthread_create_2_1                        3520    3508     -12
> ...
> | _nss_files_setnetgrent                       932     904     -28
> | _nss_dns_gethostbyaddr2_r                   1524    1480     -44
> | build_trtable                               3312    3208    -104
> | printf_positional                          25000   22580   -2420
> | Total: Before=2107999, After=2105463, chg -0.12%
That's a bit funny.  I was inside printf_positional Friday.  It's almost 
certainly the case that it sees a big improvement because it uses a lot 
of stack space and I believe also has a frame pointer.

BUt yea, I wouldn't let the minor regressions stand in the way here.


> 
> gcc/ChangeLog:
> 	PR target/106265 (partially)
> 	* config/riscv/riscv.h: New macros to check for sum of two S12
> 	range.
> 	* config/riscv/constraints.md: New constraint.
> 	* config/riscv/predicates.md: New Predicate.
> 	* config/riscv/riscv.md: New splitter to not materialize
> 	constant in desired range.
> 
> gcc/testsuite/ChangeLog:
> 	* gcc.target/riscv/sum-of-two-s12-const-1.c: New test: checks for
> 	new patterns output.
> 	* gcc.target/riscv/sum-of-two-s12-const-2.c: New test: should not
> 	ICE.
> 
> Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
> ---
>   gcc/config/riscv/constraints.md               |  6 +++
>   gcc/config/riscv/predicates.md                |  6 +++
>   gcc/config/riscv/riscv.h                      | 15 +++++++
>   gcc/config/riscv/riscv.md                     | 34 ++++++++++++++
>   .../gcc.target/riscv/sum-of-two-s12-const-1.c | 45 +++++++++++++++++++
>   .../gcc.target/riscv/sum-of-two-s12-const-2.c | 22 +++++++++
>   6 files changed, 128 insertions(+)
>   create mode 100644 gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-1.c
>   create mode 100644 gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-2.c
> 
> diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
> index 41acaea04eba..435461180c7e 100644
> --- a/gcc/config/riscv/constraints.md
> +++ b/gcc/config/riscv/constraints.md
> @@ -80,6 +80,12 @@
>     (and (match_code "const_int")
>          (match_test "LUI_OPERAND (ival)")))
>   
> +(define_constraint "MiG"
> +  "const can be represented as sum of any S12 values."
> +  (and (match_code "const_int")
> +       (ior (match_test "IN_RANGE (ival,  2048,  4094)")
> +	    (match_test "IN_RANGE (ival, -4096, -2049)"))))
> +
>   (define_constraint "Ds3"
>     "@internal
>      1, 2 or 3 immediate"
> diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
> index 6c87a7bd1f49..89490339c2da 100644
> --- a/gcc/config/riscv/predicates.md
> +++ b/gcc/config/riscv/predicates.md
> @@ -420,6 +420,12 @@
>   	return true;
>   })
>   
> +(define_predicate "const_two_s12"
> +  (match_code "const_int")
> +{
> +  return SUM_OF_TWO_S12 (INTVAL (op));
> +})
> +
>   ;; CORE-V Predicates:
>   (define_predicate "immediate_register_operand"
>     (ior (match_operand 0 "register_operand")
> diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
> index da089a03e9d1..817661058896 100644
> --- a/gcc/config/riscv/riscv.h
> +++ b/gcc/config/riscv/riscv.h
> @@ -624,6 +624,21 @@ enum reg_class
>     (((VALUE) | ((1UL<<31) - IMM_REACH)) == ((1UL<<31) - IMM_REACH)	\
>      || ((VALUE) | ((1UL<<31) - IMM_REACH)) + IMM_REACH == 0)
>   
> +/* True if a VALUE (constant) can be expressed as sum of two S12 constants
> +   (in range -2048 to 2047).
> +   Range check logic:
> +     from: min S12 + 1 (or -1 depending on what side of zero)
> +       to: two times the min S12 value (to max out S12 bits).  */
> +
> +#define SUM_OF_TWO_S12_N(VALUE)						\
> +  (((VALUE) >= (-2048 * 2)) && ((VALUE) <= (-2048 - 1)))
> +
> +#define SUM_OF_TWO_S12_P(VALUE)						\
> +  (((VALUE) >= ( 2047 + 1)) && ((VALUE) <= ( 2047 * 2)))
> +
> +#define SUM_OF_TWO_S12(VALUE)						\
> +  (SUM_OF_TWO_S12_N (VALUE) || SUM_OF_TWO_S12_P (VALUE))
> +
>   /* If this is a single bit mask, then we can load it with bseti.  Special
>      handling of SImode 0x80000000 on RV64 is done in riscv_build_integer_1. */
>   #define SINGLE_BIT_MASK_OPERAND(VALUE)					\
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index b16ed97909c0..79fe861ef91f 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -751,6 +751,40 @@
>     [(set_attr "type" "arith")
>      (set_attr "mode" "DI")])
>   
> +;; Special case of adding a reg and constant if latter is sum of two S12
> +;; values (in range -2048 to 2047). Avoid materialized the const and fuse
> +;; into the add (with an additional add for 2nd value). Makes a 3 insn
> +;; sequence into 2 insn.
> +
> +(define_insn_and_split "*add<mode>3_const_sum_of_two_s12"
> +  [(set (match_operand:P	 0 "register_operand" "=r,r")
> +	(plus:P (match_operand:P 1 "register_operand" " r,r")
> +		(match_operand:P 2 "const_two_s12"    " MiG,r")))]
> +  ""
> +  "add %0,%1,%2"
> +  ""
> +  [(set (match_dup 0)
> +	(plus:P (match_dup 1) (match_dup 3)))
> +   (set (match_dup 0)
> +	(plus:P (match_dup 0) (match_dup 4)))]
> +{
> +  int val = INTVAL (operands[2]);
> +  if (SUM_OF_TWO_S12_P (val))
> +    {
> +       operands[3] = GEN_INT (2047);
> +       operands[4] = GEN_INT (val - 2047);
> +    }
> +  else if (SUM_OF_TWO_S12_N (val))
> +    {
> +       operands[3] = GEN_INT (-2048);
> +       operands[4] = GEN_INT (val + 2048);
> +    }
> +  else
> +      gcc_unreachable ();
> +}
> +  [(set_attr "type" "arith")
> +   (set_attr "mode" "<P:MODE>")])
So why use "P" for your mode iterator?  I would have expected GPR since 
I think this works for both SI and DI mode as-is.

For the output template "#" for alternative 0 and the add instruction 
for alternative 1 is probably better.  That way it's clear to everyone 
that alternative 0 is always meant to be split.


Don't you need some kind of condition on the split to avoid splitting 
when you've got a register for operands[2]?  It would seem to me that as 
written, it would still try to spit and trigger an RTL checking error 
when you try to extract INTVAL (operands[2]).


Jeff
Vineet Gupta March 19, 2024, 12:07 a.m. UTC | #2
On 3/16/24 13:28, Jeff Law wrote:
>> Implementation Details (for posterity)
>> --------------------------------------
>>   - basic idea is to have a splitter selected via a new predicate for constant
>>     being possible sum of two S12 and provide the transform.
>>     This is however a 2 -> 2 transform which combine can't handle.
>>     So we specify it using a define_insn_and_split.
> Right.  For the record it looks like a 2->2 case because of the 
> mvconst_internal define_insn_and_split.
>
> What I was talking about in the Tuesday meeting last week was the desire 
> to rethink that pattern precisely because it drives us to need to 
> implement patterns like yours rather than the more natural 3->2 or 4->3/2.

A few weeks, I did an experimental run removing that splitter isn't
catastrophic for SPEC, granted it is a pretty narrow view of the world :-)
Below is upstream vs. revert of mvconst_internal (for apples:apples just
the revert, none of my new splitter stuff)

     Baseline        Revert mvconst_int
1,214,172,747,858 |  1,225,245,908,131 |  -0.91% <--
  740,618,658,160 |    743,873,857,461 |  -0.44% <-
  692,128,469,436 |    695,695,894,602 |  -0.52% <--
  190,811,495,310 |    190,934,386,665 |  -0.06%
  225,751,808,189 |    225,909,747,497 |  -0.07%
  220,364,237,915 |    220,466,640,640 |  -0.05%
  179,157,361,261 |    179,357,613,835 |  -0.11%
  219,313,921,837 |    219,436,712,114 |  -0.06%
  281,344,210,410 |    281,344,197,305 |   0.00%
  446,517,079,816 |    446,517,058,807 |   0.00%
  347,300,137,757 |    347,300,119,942 |   0.00%
  421,496,082,529 |    421,496,063,144 |   0.00%
  669,319,255,911 |    673,977,112,732 |  -0.70% <--
2,852,810,623,629 |  2,852,809,986,901 |   0.00%
1,855,884,349,001 |  1,855,837,810,109 |   0.00%
1,653,853,403,482 |  1,653,714,171,270 |   0.01%
2,974,347,287,057 |  2,970,520,074,342 |   0.13%
1,158,337,609,995 |  1,158,337,607,076 |   0.00%
1,019,181,486,091 |  1,020,576,716,760 |  -0.14%
1,738,961,517,942 |  1,736,024,569,247 |   0.17%
  849,330,280,930 |    848,955,738,855 |   0.04%
  276,584,892,794 |    276,457,202,331 |   0.05%
  913,410,589,335 |    913,407,101,214 |   0.00%
  903,864,384,529 |    903,800,709,452 |   0.01%
1,654,905,086,567 |  1,656,684,048,052 |  -0.11%
1,513,514,546,262 |  1,510,815,435,668 |   0.18%
1,641,980,078,831 |  1,602,410,552,874 |   2.41% <--
2,546,170,307,336 |  2,546,206,790,578 |   0.00%
1,983,551,321,388 |  1,979,562,936,994 |   0.20%
1,516,077,036,742 |  1,515,038,668,667 |   0.07%
2,056,386,745,670 |  2,055,592,903,700 |   0.04%
1,008,224,427,448 |  1,008,027,321,669 |   0.02%
1,169,305,141,789 |  1,169,028,937,430 |   0.02%
  363,827,037,541 |    361,982,880,800 |   0.51% <--
  906,649,110,459 |    909,651,995,900 |  -0.33% <-
  509,023,896,044 |    508,578,027,604 |   0.09%
      403,238,430 |        398,492,922 |   1.18%
      403,238,430 |        398,492,922 |   1.18%
38,917,902,479,830  38,886,374,486,212


Do note however that this takes us back to constant pool way of loading
complex constants (vs. bit fiddling and stitching). The 2.4% gain in
deepsjeng is exactly that and we need to decide what to do about it:
keep it as such or tweak it with a cost model change and/or make this
for everyone or have this per cpu tune - hard to tell whats the right
thing to do here.

P.S. Perhaps obvious but the prerequisite to revert is to tend to the
original linked PRs which will now start failing so that will hopefully
improve some of the above.

> Furthermore, I have a suspicion that there's logicals where we're going 
> to want to do something similar and if we keep the mvconst_internal 
> pattern they're all going to have to be implemented as 2->2s with a 
> define_insn_and_split rather than the more natural 3->2.

Naive question: why is define_split more natural vs. define_insn_and_split.
Is it because of the syntax/implementation or that one is more Combine
"centric" and other is more of a Split* Pass thing (roughly speaking of
course) or something else altogether ?

Would we have to revisit the new splitter (and perhaps audit the
existing define_insn_and_split patterns) if we were to go ahead with
this revert ?
>> +(define_insn_and_split "*add<mode>3_const_sum_of_two_s12"
>> +  [(set (match_operand:P	 0 "register_operand" "=r,r")
>> +	(plus:P (match_operand:P 1 "register_operand" " r,r")
>> +		(match_operand:P 2 "const_two_s12"    " MiG,r")))]
>> +  ""
>> +  "add %0,%1,%2"
>> +  ""
>> +  [(set (match_dup 0)
>> +	(plus:P (match_dup 1) (match_dup 3)))
>> +   (set (match_dup 0)
>> +	(plus:P (match_dup 0) (match_dup 4)))]
>> +{
>> +  int val = INTVAL (operands[2]);
>> +  if (SUM_OF_TWO_S12_P (val))
>> +    {
>> +       operands[3] = GEN_INT (2047);
>> +       operands[4] = GEN_INT (val - 2047);
>> +    }
>> +  else if (SUM_OF_TWO_S12_N (val))
>> +    {
>> +       operands[3] = GEN_INT (-2048);
>> +       operands[4] = GEN_INT (val + 2048);
>> +    }
>> +  else
>> +      gcc_unreachable ();
>> +}
>> +  [(set_attr "type" "arith")
>> +   (set_attr "mode" "<P:MODE>")])
> So why use "P" for your mode iterator?  I would have expected GPR since 
> I think this works for both SI and DI mode as-is.

My intent at least was to have this work for either of rv64/rv32 and in
either of those environments, work for both SI or DI - certainly
rv64+{SI,DI}.
To that end I debated between GPR, X and P.
It seems GPR only supports DI if TARGET_64BIT.
But I could well be wrong about above requirements or the subtle
differences in these.

> For the output template "#" for alternative 0 and the add instruction 
> for alternative 1 is probably better.  That way it's clear to everyone 
> that alternative 0 is always meant to be split.

OK.

> Don't you need some kind of condition on the split to avoid splitting 
> when you've got a register for operands[2]? 

Won't the predicate "match_code" guarantee that ?

  (define_predicate "const_two_s12"
     (match_code "const_int")
   {
     return SUM_OF_TWO_S12 (INTVAL (op), false);
   })

> It would seem to me that as 
> written, it would still try to spit and trigger an RTL checking error 
> when you try to extract INTVAL (operands[2]).

Do I need to build gcc in a certain way to expose such errors - I wasn't
able to trigger such an error for the entire testsuite or SPEC build.

Thx for the detailed quick review.
-Vineet
Jeff Law March 23, 2024, 5:59 a.m. UTC | #3
On 3/18/24 6:07 PM, Vineet Gupta wrote:

> 
> Naive question: why is define_split more natural vs. define_insn_and_split.
> Is it because of the syntax/implementation or that one is more Combine
> "centric" and other is more of a Split* Pass thing (roughly speaking of
> course) or something else altogether ?
There are parts of combine that cost based on the number of insns.  This 
is primarily to keep combine from taking an insane amount of time.

So when we have a little white lie like mvconst_internal we muck up that 
costing aspect of the combiner.  That in turn stops the combiner from 
doing certain combinations.

As a concrete example consider this:

unsigned long long w32mem(unsigned long long w32)
{
     return w32 & ~(1U << 0);
}


Right now we use a bseti+addi to construct the constant, then do the 
logical and.   Prior to the combiner it looks like this:


> (insn 7 3 8 2 (set (reg:DI 137) 
>         (const_int 4294967294 [0xfffffffe])) "j.c":3:16 206 {*mvconst_internal}
>      (nil))
> (insn 8 7 13 2 (set (reg:DI 136 [ _2 ])
>         (and:DI (reg/v:DI 135 [ w32 ])
>             (reg:DI 137))) "j.c":3:16 102 {*anddi3}
>      (expr_list:REG_DEAD (reg:DI 137)
>         (expr_list:REG_DEAD (reg/v:DI 135 [ w32 ])
>             (expr_list:REG_EQUAL (and:DI (reg/v:DI 135 [ w32 ])
>                     (const_int 4294967294 [0xfffffffe]))
>                 (nil)))))

The combiner will refuse to match a splitter where the number of 
incoming insns matches the number of resulting insns.  So to match this 
we'd have to write another define_insn_and_split.



If we didn't have mvconst_internal, then we'd see something like this:

> (insn 6 3 7 2 (set (reg:DI 138)
>         (const_int 4294967296 [0x100000000])) "j.c":3:16 -1
>      (nil))
> (insn 7 6 8 2 (set (reg:DI 137)
>         (plus:DI (reg:DI 138)
>             (const_int -2 [0xfffffffffffffffe]))) "j.c":3:16 -1
>      (expr_list:REG_EQUAL (const_int 4294967294 [0xfffffffe])
>         (nil)))
> (insn 8 7 9 2 (set (reg:DI 136 [ _2 ])
>         (and:DI (reg/v:DI 135 [ w32 ])
>             (reg:DI 137))) "j.c":3:16 -1
>      (nil))

Which we could match with a define_split which would generate RTL for 
bclri+zext.w.


Note that I suspect there's a handful of these kinds of sequences for 
logical ops where the immediate doesn't fit a simm12.


Of course the point of the white lie is to expose complex constant 
synthesis in  away that the combiner can use to simplify things.  It's 
definitely a tradeoff either way.  What's been rattling around a little 
bit would be to narrow the set of constants allowed by mvconst_internal 
to those which require 3 or more to synthesize.  THe idea being cases 
like you're looking where we can use addi+add rather than lui+addi+add 
would be rejected by mvconst_internal, but the more complex constants 
that push combine over the 4 insn limit would be allowed by 
mvconst_internal.


> 
> Would we have to revisit the new splitter (and perhaps audit the
> existing define_insn_and_split patterns) if we were to go ahead with
> this revert ?
I don't recall follow-ups which used the result of mvconst_internal in 
subsequent combiner patterns, but it should be fairly simple to search 
for them.

We'd need to look back at the original bug which resulted in creating 
the mvconst_internal pattern.  My recollection is it had a fairly 
complex constant and we were unable to see enough insns to do the 
necessary simplification to fix that case.

I bet whatever goes on inside perlbench,  mcf and x264 (guessing based 
on instruction counts in your message) + the original bug report will 
provide reasonable testcases for evaluating reversion + adding patches 
to avoid the regressions.


>> So why use "P" for your mode iterator?  I would have expected GPR since
>> I think this works for both SI and DI mode as-is.
> 
> My intent at least was to have this work for either of rv64/rv32 and in
> either of those environments, work for both SI or DI - certainly
> rv64+{SI,DI}.
> To that end I debated between GPR, X and P.
> It seems GPR only supports DI if TARGET_64BIT.
> But I could well be wrong about above requirements or the subtle
> differences in these.
I wouldn't worry about GPR only support DI for TARGET_64BIT.  I don't 
think we generally expose DImode patterns for TARGET_32BIT.

> 
>> For the output template "#" for alternative 0 and the add instruction
>> for alternative 1 is probably better.  That way it's clear to everyone
>> that alternative 0 is always meant to be split.
> 
> OK.
> 
>> Don't you need some kind of condition on the split to avoid splitting
>> when you've got a register for operands[2]?
> 
> Won't the predicate "match_code" guarantee that ?
> 
>    (define_predicate "const_two_s12"
>       (match_code "const_int")
>     {
>       return SUM_OF_TWO_S12 (INTVAL (op), false);
>     })
You're right.  Missed the match_code.  Sorry for the bad steer.

> 
> Do I need to build gcc in a certain way to expose such errors - I wasn't
> able to trigger such an error for the entire testsuite or SPEC build.
There's a distinct RTL checking mode.  It's fairly expensive from a 
compile-time standpoint though.

Jeff
diff mbox series

Patch

diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index 41acaea04eba..435461180c7e 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -80,6 +80,12 @@ 
   (and (match_code "const_int")
        (match_test "LUI_OPERAND (ival)")))
 
+(define_constraint "MiG"
+  "const can be represented as sum of any S12 values."
+  (and (match_code "const_int")
+       (ior (match_test "IN_RANGE (ival,  2048,  4094)")
+	    (match_test "IN_RANGE (ival, -4096, -2049)"))))
+
 (define_constraint "Ds3"
   "@internal
    1, 2 or 3 immediate"
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 6c87a7bd1f49..89490339c2da 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -420,6 +420,12 @@ 
 	return true;
 })
 
+(define_predicate "const_two_s12"
+  (match_code "const_int")
+{
+  return SUM_OF_TWO_S12 (INTVAL (op));
+})
+
 ;; CORE-V Predicates:
 (define_predicate "immediate_register_operand"
   (ior (match_operand 0 "register_operand")
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index da089a03e9d1..817661058896 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -624,6 +624,21 @@  enum reg_class
   (((VALUE) | ((1UL<<31) - IMM_REACH)) == ((1UL<<31) - IMM_REACH)	\
    || ((VALUE) | ((1UL<<31) - IMM_REACH)) + IMM_REACH == 0)
 
+/* True if a VALUE (constant) can be expressed as sum of two S12 constants
+   (in range -2048 to 2047).
+   Range check logic:
+     from: min S12 + 1 (or -1 depending on what side of zero)
+       to: two times the min S12 value (to max out S12 bits).  */
+
+#define SUM_OF_TWO_S12_N(VALUE)						\
+  (((VALUE) >= (-2048 * 2)) && ((VALUE) <= (-2048 - 1)))
+
+#define SUM_OF_TWO_S12_P(VALUE)						\
+  (((VALUE) >= ( 2047 + 1)) && ((VALUE) <= ( 2047 * 2)))
+
+#define SUM_OF_TWO_S12(VALUE)						\
+  (SUM_OF_TWO_S12_N (VALUE) || SUM_OF_TWO_S12_P (VALUE))
+
 /* If this is a single bit mask, then we can load it with bseti.  Special
    handling of SImode 0x80000000 on RV64 is done in riscv_build_integer_1. */
 #define SINGLE_BIT_MASK_OPERAND(VALUE)					\
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index b16ed97909c0..79fe861ef91f 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -751,6 +751,40 @@ 
   [(set_attr "type" "arith")
    (set_attr "mode" "DI")])
 
+;; Special case of adding a reg and constant if latter is sum of two S12
+;; values (in range -2048 to 2047). Avoid materialized the const and fuse
+;; into the add (with an additional add for 2nd value). Makes a 3 insn
+;; sequence into 2 insn.
+
+(define_insn_and_split "*add<mode>3_const_sum_of_two_s12"
+  [(set (match_operand:P	 0 "register_operand" "=r,r")
+	(plus:P (match_operand:P 1 "register_operand" " r,r")
+		(match_operand:P 2 "const_two_s12"    " MiG,r")))]
+  ""
+  "add %0,%1,%2"
+  ""
+  [(set (match_dup 0)
+	(plus:P (match_dup 1) (match_dup 3)))
+   (set (match_dup 0)
+	(plus:P (match_dup 0) (match_dup 4)))]
+{
+  int val = INTVAL (operands[2]);
+  if (SUM_OF_TWO_S12_P (val))
+    {
+       operands[3] = GEN_INT (2047);
+       operands[4] = GEN_INT (val - 2047);
+    }
+  else if (SUM_OF_TWO_S12_N (val))
+    {
+       operands[3] = GEN_INT (-2048);
+       operands[4] = GEN_INT (val + 2048);
+    }
+  else
+      gcc_unreachable ();
+}
+  [(set_attr "type" "arith")
+   (set_attr "mode" "<P:MODE>")])
+
 (define_expand "addv<mode>4"
   [(set (match_operand:GPR           0 "register_operand" "=r,r")
 	(plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
diff --git a/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-1.c b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-1.c
new file mode 100644
index 000000000000..4d6d135de5f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-1.c
@@ -0,0 +1,45 @@ 
+// TBD: This doesn't quite work for rv32 yet
+/* { dg-do compile } */
+/* { dg-options { -march=rv64gcv -mabi=lp64d } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-Os" "-Oz" } } */
+
+/* Ensure that gcc doesn't generate standlone li reg, 4096.  */
+long
+plus1(unsigned long i)
+{
+   return i + 2048;
+}
+
+long
+plus2(unsigned long i)
+{
+   return i + 4094;
+}
+
+long
+plus3(unsigned long i)
+{
+   return i + 2064;
+}
+
+/* Ensure that gcc doesn't generate standlone li reg, -4096.  */
+long
+minus1(unsigned long i)
+{
+   return i - 4096;
+}
+
+long
+minus2(unsigned long i)
+{
+   return i - 2049;
+}
+
+long
+minus3(unsigned long i)
+{
+   return i - 2064;
+}
+
+/* { dg-final { scan-assembler-not {li\t[a-x0-9]+,-4096} } } */
+/* { dg-final { scan-assembler-not {li\t[a-x0-9]+,4096} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-2.c b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-2.c
new file mode 100644
index 000000000000..5dcab52c2610
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sum-of-two-s12-const-2.c
@@ -0,0 +1,22 @@ 
+/* Reduced version of c-c++-common/torture/builtin-convertvector-1.c.  */
+/* This should NOT ICE */
+
+/* { dg-do compile } */
+
+typedef long b __attribute__((vector_size(256 * sizeof(long))));
+typedef double c __attribute__((vector_size(256 * sizeof(double))));
+int d;
+void e(b *f, c *g) { *g = __builtin_convertvector(*f, c); }
+void h() {
+  struct {
+    b i;
+  } j;
+  union {
+    c i;
+    double a[6];
+  } k;
+  e(&j.i, &k.i);
+  if (k.a[d])
+    for (;;)
+      ;
+}