Message ID | 20210127092038.GN4020736@tucnak |
---|---|
State | New |
Headers | show |
Series | i386: Add peephole2 for __atomic_sub_fetch (x, y, z) == 0 [PR98737] | expand |
On Wed, Jan 27, 2021 at 10:20 AM Jakub Jelinek <jakub@redhat.com> wrote: > > Hi! > > This patch adds a peephole2 for the optimization requested in the PR, > namely that we emit awful code for __atomic_sub_fetch (x, y, z) == 0 > or __atomic_sub_fetch (x, y, z) != 0 when y is not constant. > This can't be done in the combiner which punts on combining UNSPEC_VOLATILE > into other insns. > > For other ops we'd need different peephole2s, this one is specific with its > comparison instruction and negation that need to be matched. > > Bootstrapped/regtested on x86_64-linux and i686-linux. Is this ok for trunk > (as exception), or for GCC 12? If there is no urgent need, I'd rather see to obey stage-4 and wait for gcc-12. There is PR98375 meta bug to track gcc-12 pending patches. > 2021-01-27 Jakub Jelinek <jakub@redhat.com> > > PR target/98737 > * config/i386/sync.md (neg; mov; lock xadd; add peephole2): New > define_peephole2. > (*atomic_fetch_sub_cmp<mode>): New define_insn. > > * gcc.target/i386/pr98737.c: New test. OK, although this peephole is quite complex and matched sequence is easily perturbed. Please note that reg-reg move is due to RA to satisfy register constraint; if the value is already in the right register, then the sequence won't match. Do we need additional pattern with reg-reg move omitted? In the PR, Ulrich suggested to also handle other arith/logic operations, but matching these would be even harder, as they are emitted using cmpxchg loop. Maybe middle-end could emit a special version of the "boolean" atomic insn, if only flags are needed? Uros. 
> --- gcc/config/i386/sync.md.jj 2021-01-04 10:25:45.392159555 +0100 > +++ gcc/config/i386/sync.md 2021-01-26 16:03:13.911100510 +0100 > @@ -777,6 +777,63 @@ (define_insn "*atomic_fetch_add_cmp<mode > return "lock{%;} %K3add{<imodesuffix>}\t{%1, %0|%0, %1}"; > }) > > +;; Similarly, peephole for __sync_sub_fetch (x, b) == 0 into just > +;; lock sub followed by testing of flags instead of lock xadd, negation and > +;; comparison. > +(define_peephole2 > + [(parallel [(set (match_operand 0 "register_operand") > + (neg (match_dup 0))) > + (clobber (reg:CC FLAGS_REG))]) > + (set (match_operand:SWI 1 "register_operand") > + (match_operand:SWI 2 "register_operand")) > + (parallel [(set (match_operand:SWI 3 "register_operand") > + (unspec_volatile:SWI > + [(match_operand:SWI 4 "memory_operand") > + (match_operand:SI 5 "const_int_operand")] > + UNSPECV_XCHG)) > + (set (match_dup 4) > + (plus:SWI (match_dup 4) > + (match_dup 3))) > + (clobber (reg:CC FLAGS_REG))]) > + (parallel [(set (reg:CCZ FLAGS_REG) > + (compare:CCZ (neg:SWI > + (match_operand:SWI 6 "register_operand")) > + (match_dup 3))) > + (clobber (match_dup 3))])] > + "(GET_MODE (operands[0]) == <LEAMODE>mode > + || GET_MODE (operands[0]) == <MODE>mode) > + && reg_or_subregno (operands[0]) == reg_or_subregno (operands[2]) > + && (rtx_equal_p (operands[2], operands[3]) > + ? 
rtx_equal_p (operands[1], operands[6]) > + : (rtx_equal_p (operands[2], operands[6]) > + && rtx_equal_p (operands[1], operands[3]))) > + && peep2_reg_dead_p (4, operands[6]) > + && peep2_reg_dead_p (4, operands[3]) > + && !reg_overlap_mentioned_p (operands[1], operands[4]) > + && !reg_overlap_mentioned_p (operands[2], operands[4])" > + [(parallel [(set (reg:CCZ FLAGS_REG) > + (compare:CCZ > + (unspec_volatile:SWI [(match_dup 4) (match_dup 5)] > + UNSPECV_XCHG) > + (match_dup 2))) > + (set (match_dup 4) > + (minus:SWI (match_dup 4) > + (match_dup 2)))])]) > + > +(define_insn "*atomic_fetch_sub_cmp<mode>" > + [(set (reg:CCZ FLAGS_REG) > + (compare:CCZ > + (unspec_volatile:SWI > + [(match_operand:SWI 0 "memory_operand" "+m") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_XCHG) > + (match_operand:SWI 1 "register_operand" "r"))) > + (set (match_dup 0) > + (minus:SWI (match_dup 0) > + (match_dup 1)))] > + "" > + "lock{%;} %K2sub{<imodesuffix>}\t{%1, %0|%0, %1}") > + > ;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space. > ;; In addition, it is always a full barrier, so we can ignore the memory model. 
> (define_insn "atomic_exchange<mode>" > --- gcc/testsuite/gcc.target/i386/pr98737.c.jj 2021-01-26 15:59:24.640620178 +0100 > +++ gcc/testsuite/gcc.target/i386/pr98737.c 2021-01-26 16:00:02.898205888 +0100 > @@ -0,0 +1,38 @@ > +/* PR target/98737 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -masm=att" } */ > +/* { dg-additional-options "-march=i686" { target ia32 } } */ > +/* { dg-final { scan-assembler "lock\[^\n\r]\*subq\t" { target lp64 } } } */ > +/* { dg-final { scan-assembler "lock\[^\n\r]\*subl\t" } } */ > +/* { dg-final { scan-assembler "lock\[^\n\r]\*subw\t" } } */ > +/* { dg-final { scan-assembler "lock\[^\n\r]\*subb\t" } } */ > +/* { dg-final { scan-assembler-not "lock\[^\n\r]\*xadd" } } */ > + > +long a; > +int b; > +short c; > +char d; > + > +int > +foo (long x) > +{ > + return __atomic_sub_fetch (&a, x, __ATOMIC_RELEASE) == 0; > +} > + > +int > +bar (int x) > +{ > + return __atomic_sub_fetch (&b, x, __ATOMIC_RELEASE) == 0; > +} > + > +int > +baz (short x) > +{ > + return __atomic_sub_fetch (&c, x, __ATOMIC_RELEASE) == 0; > +} > + > +int > +qux (char x) > +{ > + return __atomic_sub_fetch (&d, x, __ATOMIC_RELEASE) == 0; > +} > > Jakub >
On Wed, Jan 27, 2021 at 11:22:57AM +0100, Uros Bizjak wrote: > > Bootstrapped/regtested on x86_64-linux and i686-linux. Is this ok for trunk > > (as exception), or for GCC 12? > > If there is no urgent need, I'd rather see to obey stage-4 and wait > for gcc-12. There is PR98375 meta bug to track gcc-12 pending patches. Okay. > > 2021-01-27 Jakub Jelinek <jakub@redhat.com> > > > > PR target/98737 > > * config/i386/sync.md (neg; mov; lock xadd; add peephole2): New > > define_peephole2. > > (*atomic_fetch_sub_cmp<mode>): New define_insn. > > > > * gcc.target/i386/pr98737.c: New test. > > OK, although this peephole is quite complex and matched sequence is > easily perturbed. Please note that reg-reg move is due to RA to > satisfy register constraint; if the value is already in the right > register, then the sequence won't match. Do we need additional pattern > with reg-reg move omitted? If there is no reg-reg move, then it is impossible to prove that it is a negation. The use of lock xadd forces addition instead of subtraction, and additionally modifies its result, so for the comparison one needs another register that holds the same value as the xadd initially. And we need to prove it is a negation. > In the PR, Ulrich suggested to also handle other arith/logic > operations, but matching these would be even harder, as they are > emitted using cmpxchg loop. Maybe middle-end could emit a special > version of the "boolean" atomic insn, if only flags are needed? I guess we could add new optabs for the atomic builtins whose result with the *_fetch operation rather than fetch_* is ==/!= compared against 0, not sure if we could do anything else easily, because what exact kind of comparison it is then is heavily machine dependent and the backend would then need to emit everything including branches (like e.g. the addv<mode>4 etc. expanders). Would equality comparison against 0 handle the most common cases. 
The user can write it as __atomic_sub_fetch (x, y, z) == 0 or __atomic_fetch_sub (x, y, z) - y == 0 though, so the expansion code would need to be able to cope with both. And the latter form is where all kinds of interfering optimizations pop up, e.g. for the subtraction it will be actually optimized into __atomic_fetch_sub (x, y, z) == y Jakub
On 1/27/21 11:37 AM, Jakub Jelinek wrote: > Would equality comparison against 0 handle the most common cases. > > The user can write it as > __atomic_sub_fetch (x, y, z) == 0 > or > __atomic_fetch_sub (x, y, z) - y == 0 > though, so the expansion code would need to be able to cope with both. Please also keep !=0, <0, <=0, >0, and >=0 in mind. They all can be useful and can be handled with the flags.
--- gcc/config/i386/sync.md.jj 2021-01-04 10:25:45.392159555 +0100 +++ gcc/config/i386/sync.md 2021-01-26 16:03:13.911100510 +0100 @@ -777,6 +777,63 @@ (define_insn "*atomic_fetch_add_cmp<mode return "lock{%;} %K3add{<imodesuffix>}\t{%1, %0|%0, %1}"; }) +;; Similarly, peephole for __sync_sub_fetch (x, b) == 0 into just +;; lock sub followed by testing of flags instead of lock xadd, negation and +;; comparison. +(define_peephole2 + [(parallel [(set (match_operand 0 "register_operand") + (neg (match_dup 0))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_operand:SWI 1 "register_operand") + (match_operand:SWI 2 "register_operand")) + (parallel [(set (match_operand:SWI 3 "register_operand") + (unspec_volatile:SWI + [(match_operand:SWI 4 "memory_operand") + (match_operand:SI 5 "const_int_operand")] + UNSPECV_XCHG)) + (set (match_dup 4) + (plus:SWI (match_dup 4) + (match_dup 3))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (neg:SWI + (match_operand:SWI 6 "register_operand")) + (match_dup 3))) + (clobber (match_dup 3))])] + "(GET_MODE (operands[0]) == <LEAMODE>mode + || GET_MODE (operands[0]) == <MODE>mode) + && reg_or_subregno (operands[0]) == reg_or_subregno (operands[2]) + && (rtx_equal_p (operands[2], operands[3]) + ? 
rtx_equal_p (operands[1], operands[6]) + : (rtx_equal_p (operands[2], operands[6]) + && rtx_equal_p (operands[1], operands[3]))) + && peep2_reg_dead_p (4, operands[6]) + && peep2_reg_dead_p (4, operands[3]) + && !reg_overlap_mentioned_p (operands[1], operands[4]) + && !reg_overlap_mentioned_p (operands[2], operands[4])" + [(parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (unspec_volatile:SWI [(match_dup 4) (match_dup 5)] + UNSPECV_XCHG) + (match_dup 2))) + (set (match_dup 4) + (minus:SWI (match_dup 4) + (match_dup 2)))])]) + +(define_insn "*atomic_fetch_sub_cmp<mode>" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (unspec_volatile:SWI + [(match_operand:SWI 0 "memory_operand" "+m") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPECV_XCHG) + (match_operand:SWI 1 "register_operand" "r"))) + (set (match_dup 0) + (minus:SWI (match_dup 0) + (match_dup 1)))] + "" + "lock{%;} %K2sub{<imodesuffix>}\t{%1, %0|%0, %1}") + ;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space. ;; In addition, it is always a full barrier, so we can ignore the memory model. 
(define_insn "atomic_exchange<mode>" --- gcc/testsuite/gcc.target/i386/pr98737.c.jj 2021-01-26 15:59:24.640620178 +0100 +++ gcc/testsuite/gcc.target/i386/pr98737.c 2021-01-26 16:00:02.898205888 +0100 @@ -0,0 +1,38 @@ +/* PR target/98737 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -masm=att" } */ +/* { dg-additional-options "-march=i686" { target ia32 } } */ +/* { dg-final { scan-assembler "lock\[^\n\r]\*subq\t" { target lp64 } } } */ +/* { dg-final { scan-assembler "lock\[^\n\r]\*subl\t" } } */ +/* { dg-final { scan-assembler "lock\[^\n\r]\*subw\t" } } */ +/* { dg-final { scan-assembler "lock\[^\n\r]\*subb\t" } } */ +/* { dg-final { scan-assembler-not "lock\[^\n\r]\*xadd" } } */ + +long a; +int b; +short c; +char d; + +int +foo (long x) +{ + return __atomic_sub_fetch (&a, x, __ATOMIC_RELEASE) == 0; +} + +int +bar (int x) +{ + return __atomic_sub_fetch (&b, x, __ATOMIC_RELEASE) == 0; +} + +int +baz (short x) +{ + return __atomic_sub_fetch (&c, x, __ATOMIC_RELEASE) == 0; +} + +int +qux (char x) +{ + return __atomic_sub_fetch (&d, x, __ATOMIC_RELEASE) == 0; +}