diff mbox series

[x86_64] PR target/110104: Missing peephole2 for addcarry<mode>.

Message ID 030901d998c6$ac062250$041266f0$@nextmovesoftware.com
State New
Headers show
Series [x86_64] PR target/110104: Missing peephole2 for addcarry<mode>. | expand

Commit Message

Roger Sayle June 6, 2023, 10:31 p.m. UTC
This patch resolves PR target/110104, a missed optimization on x86 around
adc with memory operands.  In i386.md, there's a peephole2 after the
pattern for *add<mode>3_cc_overflow_1 that converts the sequence
reg = add(reg,mem); mem = reg [where the reg is dead afterwards] into
the equivalent mem = add(mem,reg).  The equivalent peephole2 for adc
is missing (after addcarry<mode>), and is added by this patch.

For the example code provided in the bugzilla PR:

Before:
        movq    %rsi, %rax
        mulq    %rdx
        addq    %rax, (%rdi)
        movq    %rdx, %rax
        adcq    8(%rdi), %rax
        adcq    $0, 16(%rdi)
        movq    %rax, 8(%rdi)
        ret

After:
        movq    %rsi, %rax
        mulq    %rdx
        addq    %rax, (%rdi)
        adcq    %rdx, 8(%rdi)
        adcq    $0, 16(%rdi)
        ret

Note that the addq in this example has been transformed by the
existing peephole2 described above.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2023-06-07  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        PR target/110104
        * config/i386/i386.md (define_peephole2): Transform reg=adc(reg,mem)
        followed by mem=reg into mem=adc(mem,reg) when applicable.

gcc/testsuite/ChangeLog
        PR target/110104
        * gcc.target/i386/pr110104.c: New test case.


Thanks in advance,
Roger
--

Comments

Jakub Jelinek June 6, 2023, 10:39 p.m. UTC | #1
On Tue, Jun 06, 2023 at 11:31:42PM +0100, Roger Sayle wrote:
> 
> This patch resolves PR target/110104, a missed optimization on x86 around
> adc with memory operands.  In i386.md, there's a peephole2 after the
> pattern for *add<mode>3_cc_overflow_1 that converts the sequence
> reg = add(reg,mem); mem = reg [where the reg is dead afterwards] into
> the equivalent mem = add(mem,reg).  The equivalent peephole2 for adc
> is missing (after addcarry<mode>), and is added by this patch.
> 
> For the example code provided in the bugzilla PR:

Seems to be pretty much the same as one of the 12 define_peephole2
patterns I've posted in
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/620821.html

> 2023-06-07  Roger Sayle  <roger@nextmovesoftware.com>
> 
> gcc/ChangeLog
>         PR target/110104
>         * config/i386/i386.md (define_peephole2): Transform reg=adc(reg,mem)
>         followed by mem=reg into mem=adc(mem,reg) when applicable.
> 
> gcc/testsuite/ChangeLog
>         PR target/110104
>         * gcc.target/i386/pr110104.c: New test case.

The testcase will be useful though (but I'd go with including
the intrin header and using the intrinsic rather than builtin).

	Jakub
Roger Sayle June 6, 2023, 11:28 p.m. UTC | #2
Hi Jakub,
Jakub Jelinek wrote:
> Seems to be pretty much the same as one of the 12 define_peephole2
patterns I've posted in
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/620821.html

Doh!  Impressive work.  I need to study how you handle constant carry flags.
Fingers-crossed that patches that touch both the middle-end and a backend
don't get delayed too long in the review/approval process.

> The testcase will be useful though (but I'd go with including the intrin
header and using the intrinsic rather than builtin).

I find the use of intrin headers a pain when running cc1 under gdb,
requiring additional paths to be
specified with -I etc.  Perhaps there's a trick that I'm missing?
__builtins are more free-standing,
and therefore work with cross-compilers to targets/development environments
that I don't have.

I withdraw my patch.  Please feel free to assign PR 110104 to yourself in
Bugzilla.

Cheers (and thanks),
Roger
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e6ebc46..33ec45f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -7870,6 +7870,51 @@ 
    (set_attr "pent_pair" "pu")
    (set_attr "mode" "<MODE>")])
 
+;; peephole2 for addcarry<mode> matching one for *add<mode>3_cc_overflow_1.
+;; reg = adc(reg,mem); mem = reg  ->  mem = adc(mem,reg).
+(define_peephole2
+  [(parallel
+    [(set (reg:CCC FLAGS_REG) 
+	  (compare:CCC
+	    (zero_extend:<DWI>
+	      (plus:SWI48
+		(plus:SWI48
+		  (match_operator:SWI48 3 "ix86_carry_flag_operator"
+		    [(match_operand 2 "flags_reg_operand") (const_int 0)])
+		  (match_operand:SWI48 0 "general_reg_operand"))
+		(match_operand:SWI48 1 "memory_operand")))
+	    (plus:<DWI>
+	      (zero_extend:<DWI> (match_dup 1))
+		(match_operator:<DWI> 4 "ix86_carry_flag_operator"
+		  [(match_dup 2) (const_int 0)]))))
+     (set (match_dup 0)
+	  (plus:SWI48 (plus:SWI48 (match_op_dup 3
+				    [(match_dup 2) (const_int 0)])
+				  (match_dup 0))
+		      (match_dup 1)))])
+   (set (match_dup 1) (match_dup 0))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (2, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCC FLAGS_REG)
+	  (compare:CCC
+	    (zero_extend:<DWI>
+	      (plus:SWI48
+		(plus:SWI48
+		  (match_op_dup 3 [(match_dup 2) (const_int 0)])
+		  (match_dup 1))
+		(match_dup 0)))
+	    (plus:<DWI>
+	      (zero_extend:<DWI> (match_dup 0))
+		(match_op_dup 4
+		  [(match_dup 2) (const_int 0)]))))
+     (set (match_dup 1)
+	  (plus:SWI48 (plus:SWI48 (match_op_dup 3
+				    [(match_dup 2) (const_int 0)])
+				  (match_dup 1))
+		      (match_dup 0)))])])
+
 (define_expand "addcarry<mode>_0"
   [(parallel
      [(set (reg:CCC FLAGS_REG)
diff --git a/gcc/testsuite/gcc.target/i386/pr110104.c b/gcc/testsuite/gcc.target/i386/pr110104.c
new file mode 100644
index 0000000..bd814f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110104.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+typedef unsigned long long u64;
+typedef unsigned __int128 u128;
+void testcase1(u64 *acc, u64 a, u64 b)
+{
+  u128 res = (u128)a*b;
+  u64 lo = res, hi = res >> 64;
+  unsigned char cf = 0;
+  cf = __builtin_ia32_addcarryx_u64 (cf, lo, acc[0], acc+0);
+  cf = __builtin_ia32_addcarryx_u64 (cf, hi, acc[1], acc+1);
+  cf = __builtin_ia32_addcarryx_u64 (cf,  0, acc[2], acc+2);
+}
+
+/* { dg-final { scan-assembler-times "movq" 1 } } */