diff mbox series

[to-be-committed,RISC-V] Improve extraction of inverted single bit

Message ID 291b9b9c-77b5-4429-94b4-8faaa6778eef@gmail.com
State New
Headers show
Series [to-be-committed,RISC-V] Improve extraction of inverted single bit | expand

Commit Message

Jeff Law May 10, 2024, 10:28 p.m. UTC
So this patch fixes a minor code generation inefficiency that (IIRC) the 
RAU team discovered a while ago in spec.

If we want the inverted value of a single bit we can use bext to extract 
the bit, then seq to invert the value (if viewed as a 0/1 truth value).

The RTL is fairly convoluted, but it's basically a right shift to get 
the bit into position, bitwise-not then masking off all but the low bit. 
  So it's a 3->2 combine, hidden by the fact that and-not is a 
define_insn_and_split, so it actually looks like a 2->2 combine.

We've run this through Ventana's internal CI (which includes 
zba_zbb_zbs) and I've run it in my own tester (rv64gc, rv32gcv).  I'll 
wait for the upstream CI to finish with positive results before pushing.

Jeff
gcc/

	* config/riscv/riscv.cc (riscv_build_integer_1): Recognize cases where
	we can use shNadd to improve constant synthesis.
	(riscv_move_integer): Handle code generation for shNadd.

gcc/testsuite
	* gcc.target/riscv/synthesis-1.c: Also count shNadd instructions.
	* gcc.target/riscv/synthesis-3.c: New test.

Comments

Jeff Law May 10, 2024, 11:30 p.m. UTC | #1
On 5/10/24 4:28 PM, Jeff Law wrote:
> So this patch fixes a minor code generation inefficiency that (IIRC) the 
> RAU team discovered a while ago in spec.
> 
> If we want the inverted value of a single bit we can use bext to extract 
> the bit, then seq to invert the value (if viewed as a 0/1 truth value).
> 
> The RTL is fairly convoluted, but it's basically a right shift to get 
> the bit into position, bitwise-not then masking off all but the low bit. 
>   So it's a 3->2 combine, hidden by the fact that and-not is a 
> define_insn_and_split, so it actually looks like a 2->2 combine.
> 
> We've run this through Ventana's internal CI (which includes 
> zba_zbb_zbs) and I've run it in my own tester (rv64gc, rv32gcv).  I'll 
> wait for the upstream CI to finish with positive results before pushing.
[ ... ]
Whoops, sent the wrong patch.  The downside of doing work on one system, 
but handling email from another :(

Here's the right patch.
gcc/
	* config/riscv/bitmanip.md (*bextseqzdisi): New pattern.

gcc/testsuite/

	* gcc.target/riscv/zbs-bext-2.c: New test.

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index d76a72d30e0..cf2fa04d4c4 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -711,6 +711,30 @@ (define_insn "*bext<mode>"
   "bext\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
+;; This is a bext followed by a seqz.  Normally this would be a 3->2 split
+;; But the and-not pattern with a constant operand is a define_insn_and_split,
+;; so this looks like a 2->2 split, which combine rejects.  So implement it
+;; as a define_insn_and_split as well.
+(define_insn_and_split "*bextseqzdisi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(and:DI
+	  (not:DI
+	    (subreg:DI
+	      (lshiftrt:SI
+	        (match_operand:SI 1 "register_operand" "r")
+		(match_operand:QI 2 "register_operand" "r")) 0))
+          (const_int 1)))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (and:DI (subreg:DI
+			        (lshiftrt:SI (match_dup 1)
+					     (match_dup 2)) 0)
+			      (const_int 1)))
+   (set (match_dup 0) (eq:DI (match_dup 0) (const_int 0)))]
+  ""
+  [(set_attr "type" "bitmanip")])
+
 ;; When performing `(a & (1UL << bitno)) ? 0 : -1` the combiner
 ;; usually has the `bitno` typed as X-mode (i.e. no further
 ;; zero-extension is performed around the bitno).
diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
new file mode 100644
index 00000000000..53f47dc3afe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zbs-bext-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+
+_Bool match(const int ch, int fMap) {
+    return ((fMap & (1<<(ch))) == 0);
+}
+
+
+/* { dg-final { scan-assembler-times "bext\t" 1 } } */
+/* { dg-final { scan-assembler-times "seqz\t" 1 } } */
+/* { dg-final { scan-assembler-not "sraw\t" } } */
+/* { dg-final { scan-assembler-not "not\t" } } */
+/* { dg-final { scan-assembler-not "andi\t" } } */
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 2eac67b0ce0..75e828c81a7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -880,6 +880,37 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	}
     }
 
+  if (cost > 2 && TARGET_64BIT && TARGET_ZBA)
+    {
+      if ((value % 9) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 9, mode) + 1) < cost)
+	{
+	   alt_codes[alt_cost - 1].code = FMA;
+	   alt_codes[alt_cost - 1].value = 9;
+	   alt_codes[alt_cost - 1].use_uw = false;
+	   memcpy (codes, alt_codes, sizeof (alt_codes));
+	   cost = alt_cost;
+	}
+      if ((value % 5) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 5, mode) + 1) < cost)
+	{
+	   alt_codes[alt_cost - 1].code = FMA;
+	   alt_codes[alt_cost - 1].value = 5;
+	   alt_codes[alt_cost - 1].use_uw = false;
+	   memcpy (codes, alt_codes, sizeof (alt_codes));
+	   cost = alt_cost;
+	}
+      if ((value % 3) == 0
+	  && (alt_cost = riscv_build_integer_1 (alt_codes, value / 3, mode) + 1) < cost)
+	{
+	   alt_codes[alt_cost - 1].code = FMA;
+	   alt_codes[alt_cost - 1].value = 3;
+	   alt_codes[alt_cost - 1].use_uw = false;
+	   memcpy (codes, alt_codes, sizeof (alt_codes));
+	   cost = alt_cost;
+	}
+    }
+
   /* Final cases, particularly focused on bseti.  */
   if (cost > 2 && TARGET_ZBS)
     {
@@ -2542,6 +2573,14 @@  riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
 	      x = gen_rtx_fmt_ee (AND, mode, x, GEN_INT (value));
 	      x = riscv_emit_set (t, x);
 	    }
+	  else if (codes[i].code == FMA)
+	    {
+	      HOST_WIDE_INT value = exact_log2 (codes[i].value - 1);
+	      rtx ashift = gen_rtx_fmt_ee (ASHIFT, mode, x, GEN_INT (value));
+	      x = gen_rtx_fmt_ee (PLUS, mode, ashift, x);
+	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+	      x = riscv_emit_set (t, x);
+	    }
 	  else
 	    x = gen_rtx_fmt_ee (codes[i].code, mode,
 				x, GEN_INT (codes[i].value));
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-1.c b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
index 3384e488ade..9176d5f4989 100644
--- a/gcc/testsuite/gcc.target/riscv/synthesis-1.c
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-1.c
@@ -12,7 +12,7 @@ 
    total number of instructions. 
 
    This isn't expected to change much and any change is worthy of a look.  */
-/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|slli)" 5822 } } */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 5822 } } */
 
  unsigned long foo_0x3(void) { return 0x3UL; }
  unsigned long foo_0x5(void) { return 0x5UL; }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-3.c b/gcc/testsuite/gcc.target/riscv/synthesis-3.c
new file mode 100644
index 00000000000..5d92ac8e309
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-3.c
@@ -0,0 +1,81 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions. 
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 248 } } */
+
+unsigned long foo_0x1200000120000000 (void) { return 0x1200000120000000UL; }
+unsigned long foo_0x120000402000000 (void) { return 0x120000402000000UL; }
+unsigned long foo_0x1200100020000 (void) { return 0x1200100020000UL; }
+unsigned long foo_0x14000000a0000000 (void) { return 0x14000000a0000000UL; }
+unsigned long foo_0x140000082000000 (void) { return 0x140000082000000UL; }
+unsigned long foo_0x14000080200000 (void) { return 0x14000080200000UL; }
+unsigned long foo_0x1400080020000 (void) { return 0x1400080020000UL; }
+unsigned long foo_0x140080002000 (void) { return 0x140080002000UL; }
+unsigned long foo_0x1800000120000000 (void) { return 0x1800000120000000UL; }
+unsigned long foo_0x180000102000000 (void) { return 0x180000102000000UL; }
+unsigned long foo_0x180000801 (void) { return 0x180000801UL; }
+unsigned long foo_0x180000802 (void) { return 0x180000802UL; }
+unsigned long foo_0x180001001 (void) { return 0x180001001UL; }
+unsigned long foo_0x18000100200000 (void) { return 0x18000100200000UL; }
+unsigned long foo_0x180001002 (void) { return 0x180001002UL; }
+unsigned long foo_0x180003000 (void) { return 0x180003000UL; }
+unsigned long foo_0x1800100020000 (void) { return 0x1800100020000UL; }
+unsigned long foo_0x180081000 (void) { return 0x180081000UL; }
+unsigned long foo_0x182001000 (void) { return 0x182001000UL; }
+unsigned long foo_0x2400000240000000 (void) { return 0x2400000240000000UL; }
+unsigned long foo_0x24000080400000 (void) { return 0x24000080400000UL; }
+unsigned long foo_0x2400200040000 (void) { return 0x2400200040000UL; }
+unsigned long foo_0x2800000140000000 (void) { return 0x2800000140000000UL; }
+unsigned long foo_0x280000104000000 (void) { return 0x280000104000000UL; }
+unsigned long foo_0x28000100400000 (void) { return 0x28000100400000UL; }
+unsigned long foo_0x2800100040000 (void) { return 0x2800100040000UL; }
+unsigned long foo_0x280011000 (void) { return 0x280011000UL; }
+unsigned long foo_0x280100004000 (void) { return 0x280100004000UL; }
+unsigned long foo_0x280401000 (void) { return 0x280401000UL; }
+unsigned long foo_0x290001000 (void) { return 0x290001000UL; }
+unsigned long foo_0x30000000c0000000 (void) { return 0x30000000c0000000UL; }
+unsigned long foo_0x300000084000000 (void) { return 0x300000084000000UL; }
+unsigned long foo_0x300000801 (void) { return 0x300000801UL; }
+unsigned long foo_0x30000080400000 (void) { return 0x30000080400000UL; }
+unsigned long foo_0x300002004 (void) { return 0x300002004UL; }
+unsigned long foo_0x300006000 (void) { return 0x300006000UL; }
+unsigned long foo_0x3000080040000 (void) { return 0x3000080040000UL; }
+unsigned long foo_0x300021000 (void) { return 0x300021000UL; }
+unsigned long foo_0x300080004000 (void) { return 0x300080004000UL; }
+unsigned long foo_0x300102000 (void) { return 0x300102000UL; }
+unsigned long foo_0x300801000 (void) { return 0x300801000UL; }
+unsigned long foo_0x304002000 (void) { return 0x304002000UL; }
+unsigned long foo_0x320001000 (void) { return 0x320001000UL; }
+unsigned long foo_0x4800000480000000 (void) { return 0x4800000480000000UL; }
+unsigned long foo_0x48000100800000 (void) { return 0x48000100800000UL; }
+unsigned long foo_0x4800400080000 (void) { return 0x4800400080000UL; }
+unsigned long foo_0x5000000280000000 (void) { return 0x5000000280000000UL; }
+unsigned long foo_0x500000208000000 (void) { return 0x500000208000000UL; }
+unsigned long foo_0x50000200800000 (void) { return 0x50000200800000UL; }
+unsigned long foo_0x5000200080000 (void) { return 0x5000200080000UL; }
+unsigned long foo_0x500200008000 (void) { return 0x500200008000UL; }
+unsigned long foo_0x6000000180000000 (void) { return 0x6000000180000000UL; }
+unsigned long foo_0x600000108000000 (void) { return 0x600000108000000UL; }
+unsigned long foo_0x60000100800000 (void) { return 0x60000100800000UL; }
+unsigned long foo_0x6000100080000 (void) { return 0x6000100080000UL; }
+unsigned long foo_0x600100008000 (void) { return 0x600100008000UL; }
+unsigned long foo_0x900000090000000 (void) { return 0x900000090000000UL; }
+unsigned long foo_0x90000201000000 (void) { return 0x90000201000000UL; }
+unsigned long foo_0x900080010000 (void) { return 0x900080010000UL; }
+unsigned long foo_0x90200001000 (void) { return 0x90200001000UL; }
+unsigned long foo_0xc00000090000000 (void) { return 0xc00000090000000UL; }
+unsigned long foo_0xc0000081000000 (void) { return 0xc0000081000000UL; }
+unsigned long foo_0xc000080100000 (void) { return 0xc000080100000UL; }
+unsigned long foo_0xc00080010000 (void) { return 0xc00080010000UL; }
+unsigned long foo_0xc0080001000 (void) { return 0xc0080001000UL; }