diff mbox series

[RISC-V] Use shNadd for constant synthesis

Message ID 146c1cf0-d829-4d15-890d-e32a2c4f16a9@ventanamicro.com
State New
Headers show
Series [RISC-V] Use shNadd for constant synthesis | expand

Commit Message

Jeff Law May 10, 2024, 3:36 a.m. UTC
So here's the next idiom to improve constant synthesis.

The basic idea here is to try and use shNadd to generate the constant 
when profitable.

Let's take 0x300000801.  Right now that generates:

         li      a0,3145728
         addi    a0,a0,1
         slli    a0,a0,12
         addi    a0,a0,-2047


But we can do better.  The constant is evenly divisible by 9 resulting 
in 0x55555639 which doesn't look terribly interesting.  But that 
constant can be generated with two instructions, then we can use a 
sh3add to multiply it by 9.  So the updated sequence looks like:

         li      a0,1431654400
         addi    a0,a0,1593
         sh3add  a0,a0,a0


This doesn't trigger a whole lot, but I haven't really set up a test to 
explore the most likely space where this might be useful.  The tests 
were found exploring a different class of constant synthesis problems.

If you were to dive into the before/after you'd see that the shNadd 
interacts quite nicely with the recent bseti work.   The joys of recursion.

Probably the most controversial thing in here is using the "FMA" opcode 
to stand in for when we want to use shNadd.  Essentially when we 
synthesize a constant we generate a series of RTL opcodes and constants 
for emission by another routine.   We don't really have a way to say we 
want a shift-add.  But you can think of shift-add as a limited form of 
multiply-accumulate.  It's a bit of a stretch, but not crazy bad IMHO.

Other approaches would be to store our own enum rather than an RTL 
opcode.  Or store an actual generator function rather than any kind of 
opcode.

It wouldn't take much pushback over (ab)using FMA in this manner to get 
me to use our own enums rather than RTL opcodes for this stuff.

Tested on rv64gc and rv32gcv.  Waiting on wider CI run before committing.

Jeff
gcc/

	* config/riscv/riscv.cc (riscv_build_integer_1): Recognize cases where
	we can use shNadd to improve constant synthesis.
	(riscv_move_integer): Handle code generation for shNadd.

gcc/testsuite
	* gcc.target/riscv/synthesis-1.c: Also count shNadd instructions.
	* gcc.target/riscv/synthesis-3.c: New test.
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 2eac67b0ce0..75e828c81a7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -880,6 +880,37 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	}
     }
 
+  if (cost > 2 && TARGET_64BIT && TARGET_ZBA)
+    {
+      if ((value % 9) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 9, mode) + 1) < cost)
+	{
+	   alt_codes[alt_cost - 1].code = FMA;
+	   alt_codes[alt_cost - 1].value = 9;
+	   alt_codes[alt_cost - 1].use_uw = false;
+	   memcpy (codes, alt_codes, sizeof (alt_codes));
+	   cost = alt_cost;
+	}
+      if ((value % 5) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 5, mode) + 1) < cost)
+	{
+	   alt_codes[alt_cost - 1].code = FMA;
+	   alt_codes[alt_cost - 1].value = 5;
+	   alt_codes[alt_cost - 1].use_uw = false;
+	   memcpy (codes, alt_codes, sizeof (alt_codes));
+	   cost = alt_cost;
+	}
+      if ((value % 3) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 3, mode) + 1) < cost)
+	{
+	   alt_codes[alt_cost - 1].code = FMA;
+	   alt_codes[alt_cost - 1].value = 3;
+	   alt_codes[alt_cost - 1].use_uw = false;
+	   memcpy (codes, alt_codes, sizeof (alt_codes));
+	   cost = alt_cost;
+	}
+    }
+
   /* Final cases, particularly focused on bseti.  */
   if (cost > 2 && TARGET_ZBS)
     {
@@ -2542,6 +2573,14 @@  riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
 	      x = gen_rtx_fmt_ee (AND, mode, x, GEN_INT (value));
 	      x = riscv_emit_set (t, x);
 	    }
+	  else if (codes[i].code == FMA)
+	    {
+	      HOST_WIDE_INT value = exact_log2 (codes[i].value - 1);
+	      rtx ashift = gen_rtx_fmt_ee (ASHIFT, mode, x, GEN_INT (value));
+	      x = gen_rtx_fmt_ee (PLUS, mode, ashift, x);
+	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+	      x = riscv_emit_set (t, x);
+	    }
 	  else
 	    x = gen_rtx_fmt_ee (codes[i].code, mode,
 				x, GEN_INT (codes[i].value));
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-1.c b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
index 3384e488ade..9176d5f4989 100644
--- a/gcc/testsuite/gcc.target/riscv/synthesis-1.c
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
@@ -12,7 +12,7 @@ 
    total number of instructions. 
 
    This isn't expected to change much and any change is worthy of a look.  */
-/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|slli)" 5822 } } */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 5822 } } */
 
  unsigned long foo_0x3(void) { return 0x3UL; }
  unsigned long foo_0x5(void) { return 0x5UL; }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-3.c b/gcc/testsuite/gcc.target/riscv/synthesis-3.c
new file mode 100644
index 00000000000..5d92ac8e309
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-3.c
@@ -0,0 +1,81 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions. 
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 248 } } */
+
+unsigned long foo_0x1200000120000000 (void) { return 0x1200000120000000UL; }
+unsigned long foo_0x120000402000000 (void) { return 0x120000402000000UL; }
+unsigned long foo_0x1200100020000 (void) { return 0x1200100020000UL; }
+unsigned long foo_0x14000000a0000000 (void) { return 0x14000000a0000000UL; }
+unsigned long foo_0x140000082000000 (void) { return 0x140000082000000UL; }
+unsigned long foo_0x14000080200000 (void) { return 0x14000080200000UL; }
+unsigned long foo_0x1400080020000 (void) { return 0x1400080020000UL; }
+unsigned long foo_0x140080002000 (void) { return 0x140080002000UL; }
+unsigned long foo_0x1800000120000000 (void) { return 0x1800000120000000UL; }
+unsigned long foo_0x180000102000000 (void) { return 0x180000102000000UL; }
+unsigned long foo_0x180000801 (void) { return 0x180000801UL; }
+unsigned long foo_0x180000802 (void) { return 0x180000802UL; }
+unsigned long foo_0x180001001 (void) { return 0x180001001UL; }
+unsigned long foo_0x18000100200000 (void) { return 0x18000100200000UL; }
+unsigned long foo_0x180001002 (void) { return 0x180001002UL; }
+unsigned long foo_0x180003000 (void) { return 0x180003000UL; }
+unsigned long foo_0x1800100020000 (void) { return 0x1800100020000UL; }
+unsigned long foo_0x180081000 (void) { return 0x180081000UL; }
+unsigned long foo_0x182001000 (void) { return 0x182001000UL; }
+unsigned long foo_0x2400000240000000 (void) { return 0x2400000240000000UL; }
+unsigned long foo_0x24000080400000 (void) { return 0x24000080400000UL; }
+unsigned long foo_0x2400200040000 (void) { return 0x2400200040000UL; }
+unsigned long foo_0x2800000140000000 (void) { return 0x2800000140000000UL; }
+unsigned long foo_0x280000104000000 (void) { return 0x280000104000000UL; }
+unsigned long foo_0x28000100400000 (void) { return 0x28000100400000UL; }
+unsigned long foo_0x2800100040000 (void) { return 0x2800100040000UL; }
+unsigned long foo_0x280011000 (void) { return 0x280011000UL; }
+unsigned long foo_0x280100004000 (void) { return 0x280100004000UL; }
+unsigned long foo_0x280401000 (void) { return 0x280401000UL; }
+unsigned long foo_0x290001000 (void) { return 0x290001000UL; }
+unsigned long foo_0x30000000c0000000 (void) { return 0x30000000c0000000UL; }
+unsigned long foo_0x300000084000000 (void) { return 0x300000084000000UL; }
+unsigned long foo_0x300000801 (void) { return 0x300000801UL; }
+unsigned long foo_0x30000080400000 (void) { return 0x30000080400000UL; }
+unsigned long foo_0x300002004 (void) { return 0x300002004UL; }
+unsigned long foo_0x300006000 (void) { return 0x300006000UL; }
+unsigned long foo_0x3000080040000 (void) { return 0x3000080040000UL; }
+unsigned long foo_0x300021000 (void) { return 0x300021000UL; }
+unsigned long foo_0x300080004000 (void) { return 0x300080004000UL; }
+unsigned long foo_0x300102000 (void) { return 0x300102000UL; }
+unsigned long foo_0x300801000 (void) { return 0x300801000UL; }
+unsigned long foo_0x304002000 (void) { return 0x304002000UL; }
+unsigned long foo_0x320001000 (void) { return 0x320001000UL; }
+unsigned long foo_0x4800000480000000 (void) { return 0x4800000480000000UL; }
+unsigned long foo_0x48000100800000 (void) { return 0x48000100800000UL; }
+unsigned long foo_0x4800400080000 (void) { return 0x4800400080000UL; }
+unsigned long foo_0x5000000280000000 (void) { return 0x5000000280000000UL; }
+unsigned long foo_0x500000208000000 (void) { return 0x500000208000000UL; }
+unsigned long foo_0x50000200800000 (void) { return 0x50000200800000UL; }
+unsigned long foo_0x5000200080000 (void) { return 0x5000200080000UL; }
+unsigned long foo_0x500200008000 (void) { return 0x500200008000UL; }
+unsigned long foo_0x6000000180000000 (void) { return 0x6000000180000000UL; }
+unsigned long foo_0x600000108000000 (void) { return 0x600000108000000UL; }
+unsigned long foo_0x60000100800000 (void) { return 0x60000100800000UL; }
+unsigned long foo_0x6000100080000 (void) { return 0x6000100080000UL; }
+unsigned long foo_0x600100008000 (void) { return 0x600100008000UL; }
+unsigned long foo_0x900000090000000 (void) { return 0x900000090000000UL; }
+unsigned long foo_0x90000201000000 (void) { return 0x90000201000000UL; }
+unsigned long foo_0x900080010000 (void) { return 0x900080010000UL; }
+unsigned long foo_0x90200001000 (void) { return 0x90200001000UL; }
+unsigned long foo_0xc00000090000000 (void) { return 0xc00000090000000UL; }
+unsigned long foo_0xc0000081000000 (void) { return 0xc0000081000000UL; }
+unsigned long foo_0xc000080100000 (void) { return 0xc000080100000UL; }
+unsigned long foo_0xc00080010000 (void) { return 0xc00080010000UL; }
+unsigned long foo_0xc0080001000 (void) { return 0xc0080001000UL; }