Message ID | 20220210091309.GA20692@delia.home |
---|---|
State | New |
Headers | show |
Series | [committed,nvptx] Handle pre-sm_7x shared atomic store using atomic exchange | expand |
Hi Tom! For my understanding: On 2022-02-10T10:13:10+0100, Tom de Vries via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > The ptx isa specifies (for pre-sm_7x) that atomic operations on shared memory > locations do not guarantee atomicity with respect to normal store instructions > to the same address. > > This can be fixed by: > - inserting barriers between normal stores and atomic operations to a common > address > - using atom.exch to store to locations accessed by other atomic operations. > > It's not clearly spelled out which barriers are needed, and a barrier seem more > expensive than atomic exchange. > > Implement the pre-sm_7x shared atomic store using atomic exchange. > > That includes stores using generic addressing, since those may also point to > shared memory. It is expected that this changes, for example (similar elsewhere) 'nvptx-none/libatomic/store_4_.o', to use (a) 'atom.exch' (with a new dummy register allocated) and yet (b) 'membar.sys' remains before as well as after (a): .visible .func __atomic_store_4 (.param .u64 %in_ar0, .param .u32 %in_ar1, .param .u32 %in_ar2) { .reg .u64 %ar0; ld.param.u64 %ar0,[%in_ar0]; .reg .u32 %ar1; ld.param.u32 %ar1,[%in_ar1]; .reg .u32 %ar2; ld.param.u32 %ar2,[%in_ar2]; .reg .u64 %r22; .reg .u32 %r23; +.reg .u32 %r25; mov.u64 %r22,%ar0; mov.u32 %r23,%ar1; .loc 2 39 5 membar.sys; -st.u32 [%r22],%r23; +atom.exch.b32 %r25,[%r22],%r23; membar.sys; .loc 2 40 1 ret; } And, for example (similar elsewhere) 'nvptx-none/libgomp/single.o', a 'ld' that previously was issued after 'membar.sys' now moves before that one: .visible .func (.param .u64 %value_out) GOMP_single_copy_start { [...] .reg .u64 %r33; .reg .u64 %r34; [...] .reg .pred %r51; .reg .u64 %r50; .reg .u64 %r52; [...] ld.shared.u64 %r50,[nvptx_thrs]; add.u64 %r33,%r50,%r49; .loc 2 1317 32 ld.u64 %r34,[%r33+32]; .loc 2 1317 6 setp.eq.u64 %r51,%r34,0; @ %r51 bra $L6; .loc 4 66 3 -membar.sys; ld.u64 %r52,[%r33+24]; -st.u64 [%r34+80],%r52; +membar.sys; +atom.exch.b64 %r53,[%r34+80],%r52; $L6: [...] (But I see there is another 'ld.u64 %r34,[%r33+32]' that previously also already was issued before the 'membar.sys'.) Grüße Thomas > [nvptx] Handle pre-sm_7x shared atomic store using atomic exchange > > gcc/ChangeLog: > > 2022-02-02 Tom de Vries <tdevries@suse.de> > > * config/nvptx/nvptx-protos.h (nvptx_mem_maybe_shared_p): Declare. > * config/nvptx/nvptx.cc (nvptx_mem_data_area): New static function. > (nvptx_mem_maybe_shared_p): New function. > * config/nvptx/nvptx.md (define_expand "atomic_store<mode>"): New > define_expand. > > gcc/testsuite/ChangeLog: > > 2022-02-02 Tom de Vries <tdevries@suse.de> > > * gcc.target/nvptx/atomic-store-1.c: New test. > * gcc.target/nvptx/atomic-store-3.c: New test. > * gcc.target/nvptx/stack-atomics-run.c: Update. > > --- > gcc/config/nvptx/nvptx-protos.h | 1 + > gcc/config/nvptx/nvptx.cc | 22 ++++++++++++++++ > gcc/config/nvptx/nvptx.md | 30 ++++++++++++++++++++++ > gcc/testsuite/gcc.target/nvptx/atomic-store-1.c | 26 +++++++++++++++++++ > gcc/testsuite/gcc.target/nvptx/atomic-store-3.c | 25 ++++++++++++++++++ > gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c | 6 ++++- > 6 files changed, 109 insertions(+), 1 deletion(-) > > diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h > index a846e341917..0bf9af406a2 100644 > --- a/gcc/config/nvptx/nvptx-protos.h > +++ b/gcc/config/nvptx/nvptx-protos.h > @@ -60,5 +60,6 @@ extern const char *nvptx_output_simt_exit (rtx); > extern const char *nvptx_output_red_partition (rtx, rtx); > extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int); > extern bool nvptx_mem_local_p (rtx); > +extern bool nvptx_mem_maybe_shared_p (const_rtx); > #endif > #endif > diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc > index 1b0227a2c31..5b26c0f4c7d 100644 > --- a/gcc/config/nvptx/nvptx.cc > +++ b/gcc/config/nvptx/nvptx.cc > @@ -76,6 +76,7 @@ > #include "intl.h" > #include "opts.h" > #include "tree-pretty-print.h" > +#include "rtl-iter.h" > > /* This file should be included last. */ > #include "target-def.h" > @@ -2787,6 +2788,27 @@ nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr) > nvptx_print_address_operand (file, addr, mode); > } > > +static nvptx_data_area > +nvptx_mem_data_area (const_rtx x) > +{ > + gcc_assert (GET_CODE (x) == MEM); > + > + const_rtx addr = XEXP (x, 0); > + subrtx_iterator::array_type array; > + FOR_EACH_SUBRTX (iter, array, addr, ALL) > + if (SYMBOL_REF_P (*iter)) > + return SYMBOL_DATA_AREA (*iter); > + > + return DATA_AREA_GENERIC; > +} > + > +bool > +nvptx_mem_maybe_shared_p (const_rtx x) > +{ > + nvptx_data_area area = nvptx_mem_data_area (x); > + return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC; > +} > + > /* Print an operand, X, to FILE, with an optional modifier in CODE. > > Meaning of CODE: > diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md > index cced68e0d4a..1a283b41922 100644 > --- a/gcc/config/nvptx/nvptx.md > +++ b/gcc/config/nvptx/nvptx.md > @@ -2051,6 +2051,36 @@ (define_insn "atomic_exchange<mode>" > } > [(set_attr "atomic" "true")]) > > +(define_expand "atomic_store<mode>" > + [(match_operand:SDIM 0 "memory_operand" "=m") ;; memory > + (match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input > + (match_operand:SI 2 "const_int_operand")] ;; model > + "" > +{ > + struct address_info info; > + decompose_mem_address (&info, operands[0]); > + if (info.base != NULL && REG_P (*info.base) > + && REGNO_PTR_FRAME_P (REGNO (*info.base))) > + { > + emit_insn (gen_mov<mode> (operands[0], operands[1])); > + DONE; > + } > + > + if (TARGET_SM70) > + /* Fall back to expand_atomic_store. */ > + FAIL; > + > + bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]); > + if (!maybe_shared_p) > + /* Fall back to expand_atomic_store. */ > + FAIL; > + > + rtx tmpreg = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1], > + operands[2])); > + DONE; > +}) > + > (define_insn "atomic_fetch_add<mode>" > [(set (match_operand:SDIM 1 "memory_operand" "+m") > (unspec_volatile:SDIM > diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c > new file mode 100644 > index 00000000000..cee3815eda5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c > @@ -0,0 +1,26 @@ > +/* Test the atomic store expansion for sm <= sm_6x targets, > + shared state space. */ > + > +/* { dg-do compile } */ > +/* { dg-options "-misa=sm_53" } */ > + > +enum memmodel > +{ > + MEMMODEL_SEQ_CST = 5 > +}; > + > +unsigned int u32 __attribute__((shared)); > +unsigned long long int u64 __attribute__((shared)); > + > +int > +main() > +{ > + __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST); > + __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST); > + > + return 0; > +} > + > +/* { dg-final { scan-assembler-times "atom.shared.exch.b32" 1 } } */ > +/* { dg-final { scan-assembler-times "atom.shared.exch.b64" 1 } } */ > +/* { dg-final { scan-assembler-times "membar.cta" 4 } } */ > diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c > new file mode 100644 > index 00000000000..cc0264f2b06 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c > @@ -0,0 +1,25 @@ > +/* Test the atomic store expansion, global state space. */ > + > +/* { dg-do compile } */ > +/* { dg-additional-options "-Wno-long-long" } */ > + > +enum memmodel > +{ > + MEMMODEL_SEQ_CST = 5 > +}; > + > +unsigned int u32; > +unsigned long long int u64; > + > +int > +main() > +{ > + __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST); > + __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST); > + > + return 0; > +} > + > +/* { dg-final { scan-assembler-times "st.global.u32" 1 } } */ > +/* { dg-final { scan-assembler-times "st.global.u64" 1 } } */ > +/* { dg-final { scan-assembler-times "membar.sys" 4 } } */ > diff --git a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c > index ad8e2f842fb..cd045964dfe 100644 > --- a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c > +++ b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c > @@ -39,6 +39,10 @@ main (void) > if (b != 1) > __builtin_abort (); > > - > + a = 1; > + __atomic_store_n (&a, 0, MEMMODEL_RELAXED); > + if (a != 0) > + __builtin_abort (); > + > return 0; > } ----------------- Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
On 2/15/22 08:34, Thomas Schwinge wrote: > Hi Tom! > > For my understanding: > > On 2022-02-10T10:13:10+0100, Tom de Vries via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: >> The ptx isa specifies (for pre-sm_7x) that atomic operations on shared memory >> locations do not guarantee atomicity with respect to normal store instructions >> to the same address. >> >> This can be fixed by: >> - inserting barriers between normal stores and atomic operations to a common >> address >> - using atom.exch to store to locations accessed by other atomic operations. >> >> It's not clearly spelled out which barriers are needed, and a barrier seem more >> expensive than atomic exchange. >> >> Implement the pre-sm_7x shared atomic store using atomic exchange. >> >> That includes stores using generic addressing, since those may also point to >> shared memory. > > It is expected that this changes, for example (similar elsewhere) > 'nvptx-none/libatomic/store_4_.o', to use (a) 'atom.exch' (with a new > dummy register allocated) Yes. We could do slightly better by emitting that as: ... membar.sys; { .reg .u32 dummy; atom.exch.b32 dummy,[%r22],%r23; } membar.sys; ... which could improve register pressure. I just wrote a patch for that (attached, ftr), but using a scratch register, and it seems that this similar code: ... void foo (U_4 *mptr, U_4 newval) { __atomic_exchange_n (mptr, newval, 5); } ... still maps to: ... .reg .u32 %r24; membar.sys; atom.exch.b32 %r24,[%r22],%r23; membar.sys; ... so that may not be the right way to do it. > and yet (b) 'membar.sys' remains before as well > as after (a): > > .visible .func __atomic_store_4 (.param .u64 %in_ar0, .param .u32 %in_ar1, .param .u32 %in_ar2) > { > .reg .u64 %ar0; > ld.param.u64 %ar0,[%in_ar0]; > .reg .u32 %ar1; > ld.param.u32 %ar1,[%in_ar1]; > .reg .u32 %ar2; > ld.param.u32 %ar2,[%in_ar2]; > .reg .u64 %r22; > .reg .u32 %r23; > +.reg .u32 %r25; > mov.u64 %r22,%ar0; > mov.u32 %r23,%ar1; > .loc 2 39 5 > membar.sys; > -st.u32 [%r22],%r23; > +atom.exch.b32 %r25,[%r22],%r23; > membar.sys; > .loc 2 40 1 > ret; > } > Yes. Previously, the barriers where there to guarantee atomicity of the store, as well as to implement the memory model MEMMODEL_SEQ_CST. Now the atomic exchange garantees atomicity of the store and the barriers implement the memory model. In this particular instance (we're using the most severe barrier both before and after), we might be able to get away with using a store instead of atomic exchange. But this is a fallback, and we don't care so much about performance (besides it's already pretty bad with two times membar.sys). And I rather have atomicity guarantueed by using an atomic insn than by using barriers: there's less room for error by the driver JIT. > And, for example (similar elsewhere) 'nvptx-none/libgomp/single.o', a > 'ld' that previously was issued after 'membar.sys' now moves before that > one: > > .visible .func (.param .u64 %value_out) GOMP_single_copy_start > { > [...] > .reg .u64 %r33; > .reg .u64 %r34; > [...] > .reg .pred %r51; > .reg .u64 %r50; > .reg .u64 %r52; > [...] > ld.shared.u64 %r50,[nvptx_thrs]; > add.u64 %r33,%r50,%r49; > .loc 2 1317 32 > ld.u64 %r34,[%r33+32]; > .loc 2 1317 6 > setp.eq.u64 %r51,%r34,0; > @ %r51 bra $L6; > .loc 4 66 3 > -membar.sys; > ld.u64 %r52,[%r33+24]; > -st.u64 [%r34+80],%r52; > +membar.sys; > +atom.exch.b64 %r53,[%r34+80],%r52; > $L6: > [...] > > (But I see there is another 'ld.u64 %r34,[%r33+32]' that previously also > already was issued before the 'membar.sys'.) > Before, an atomic store resulted in two seperate insns, a membar and a regular store, and so it could be that they became separated. F.i., if alias analysis could prove that the load did not alias with the store, it could move the load inbetween. Now the memmodel is an operand of the insn, and the membar is only implemented when printing the insn, so nothing can come inbetween anymore. Note btw that we had here this pattern: ... membar.sys; st.u64 [%r34+80],%r52; ... and there's no memory barrier after the store to guarantee atomicity with respect to atomic operations afterwards. Thanks, - Tom > > Grüße > Thomas > > >> [nvptx] Handle pre-sm_7x shared atomic store using atomic exchange >> >> gcc/ChangeLog: >> >> 2022-02-02 Tom de Vries <tdevries@suse.de> >> >> * config/nvptx/nvptx-protos.h (nvptx_mem_maybe_shared_p): Declare. >> * config/nvptx/nvptx.cc (nvptx_mem_data_area): New static function. >> (nvptx_mem_maybe_shared_p): New function. >> * config/nvptx/nvptx.md (define_expand "atomic_store<mode>"): New >> define_expand. >> >> gcc/testsuite/ChangeLog: >> >> 2022-02-02 Tom de Vries <tdevries@suse.de> >> >> * gcc.target/nvptx/atomic-store-1.c: New test. >> * gcc.target/nvptx/atomic-store-3.c: New test. >> * gcc.target/nvptx/stack-atomics-run.c: Update. >> >> --- >> gcc/config/nvptx/nvptx-protos.h | 1 + >> gcc/config/nvptx/nvptx.cc | 22 ++++++++++++++++ >> gcc/config/nvptx/nvptx.md | 30 ++++++++++++++++++++++ >> gcc/testsuite/gcc.target/nvptx/atomic-store-1.c | 26 +++++++++++++++++++ >> gcc/testsuite/gcc.target/nvptx/atomic-store-3.c | 25 ++++++++++++++++++ >> gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c | 6 ++++- >> 6 files changed, 109 insertions(+), 1 deletion(-) >> >> diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h >> index a846e341917..0bf9af406a2 100644 >> --- a/gcc/config/nvptx/nvptx-protos.h >> +++ b/gcc/config/nvptx/nvptx-protos.h >> @@ -60,5 +60,6 @@ extern const char *nvptx_output_simt_exit (rtx); >> extern const char *nvptx_output_red_partition (rtx, rtx); >> extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int); >> extern bool nvptx_mem_local_p (rtx); >> +extern bool nvptx_mem_maybe_shared_p (const_rtx); >> #endif >> #endif >> diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc >> index 1b0227a2c31..5b26c0f4c7d 100644 >> --- a/gcc/config/nvptx/nvptx.cc >> +++ b/gcc/config/nvptx/nvptx.cc >> @@ -76,6 +76,7 @@ >> #include "intl.h" >> #include "opts.h" >> #include "tree-pretty-print.h" >> +#include "rtl-iter.h" >> >> /* This file should be included last. */ >> #include "target-def.h" >> @@ -2787,6 +2788,27 @@ nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr) >> nvptx_print_address_operand (file, addr, mode); >> } >> >> +static nvptx_data_area >> +nvptx_mem_data_area (const_rtx x) >> +{ >> + gcc_assert (GET_CODE (x) == MEM); >> + >> + const_rtx addr = XEXP (x, 0); >> + subrtx_iterator::array_type array; >> + FOR_EACH_SUBRTX (iter, array, addr, ALL) >> + if (SYMBOL_REF_P (*iter)) >> + return SYMBOL_DATA_AREA (*iter); >> + >> + return DATA_AREA_GENERIC; >> +} >> + >> +bool >> +nvptx_mem_maybe_shared_p (const_rtx x) >> +{ >> + nvptx_data_area area = nvptx_mem_data_area (x); >> + return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC; >> +} >> + >> /* Print an operand, X, to FILE, with an optional modifier in CODE. >> >> Meaning of CODE: >> diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md >> index cced68e0d4a..1a283b41922 100644 >> --- a/gcc/config/nvptx/nvptx.md >> +++ b/gcc/config/nvptx/nvptx.md >> @@ -2051,6 +2051,36 @@ (define_insn "atomic_exchange<mode>" >> } >> [(set_attr "atomic" "true")]) >> >> +(define_expand "atomic_store<mode>" >> + [(match_operand:SDIM 0 "memory_operand" "=m") ;; memory >> + (match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input >> + (match_operand:SI 2 "const_int_operand")] ;; model >> + "" >> +{ >> + struct address_info info; >> + decompose_mem_address (&info, operands[0]); >> + if (info.base != NULL && REG_P (*info.base) >> + && REGNO_PTR_FRAME_P (REGNO (*info.base))) >> + { >> + emit_insn (gen_mov<mode> (operands[0], operands[1])); >> + DONE; >> + } >> + >> + if (TARGET_SM70) >> + /* Fall back to expand_atomic_store. */ >> + FAIL; >> + >> + bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]); >> + if (!maybe_shared_p) >> + /* Fall back to expand_atomic_store. */ >> + FAIL; >> + >> + rtx tmpreg = gen_reg_rtx (<MODE>mode); >> + emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1], >> + operands[2])); >> + DONE; >> +}) >> + >> (define_insn "atomic_fetch_add<mode>" >> [(set (match_operand:SDIM 1 "memory_operand" "+m") >> (unspec_volatile:SDIM >> diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c >> new file mode 100644 >> index 00000000000..cee3815eda5 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c >> @@ -0,0 +1,26 @@ >> +/* Test the atomic store expansion for sm <= sm_6x targets, >> + shared state space. */ >> + >> +/* { dg-do compile } */ >> +/* { dg-options "-misa=sm_53" } */ >> + >> +enum memmodel >> +{ >> + MEMMODEL_SEQ_CST = 5 >> +}; >> + >> +unsigned int u32 __attribute__((shared)); >> +unsigned long long int u64 __attribute__((shared)); >> + >> +int >> +main() >> +{ >> + __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST); >> + __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST); >> + >> + return 0; >> +} >> + >> +/* { dg-final { scan-assembler-times "atom.shared.exch.b32" 1 } } */ >> +/* { dg-final { scan-assembler-times "atom.shared.exch.b64" 1 } } */ >> +/* { dg-final { scan-assembler-times "membar.cta" 4 } } */ >> diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c >> new file mode 100644 >> index 00000000000..cc0264f2b06 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c >> @@ -0,0 +1,25 @@ >> +/* Test the atomic store expansion, global state space. */ >> + >> +/* { dg-do compile } */ >> +/* { dg-additional-options "-Wno-long-long" } */ >> + >> +enum memmodel >> +{ >> + MEMMODEL_SEQ_CST = 5 >> +}; >> + >> +unsigned int u32; >> +unsigned long long int u64; >> + >> +int >> +main() >> +{ >> + __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST); >> + __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST); >> + >> + return 0; >> +} >> + >> +/* { dg-final { scan-assembler-times "st.global.u32" 1 } } */ >> +/* { dg-final { scan-assembler-times "st.global.u64" 1 } } */ >> +/* { dg-final { scan-assembler-times "membar.sys" 4 } } */ >> diff --git a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c >> index ad8e2f842fb..cd045964dfe 100644 >> --- a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c >> +++ b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c >> @@ -39,6 +39,10 @@ main (void) >> if (b != 1) >> __builtin_abort (); >> >> - >> + a = 1; >> + __atomic_store_n (&a, 0, MEMMODEL_RELAXED); >> + if (a != 0) >> + __builtin_abort (); >> + >> return 0; >> } > ----------------- > Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
Hi Tom! On 2022-02-15T11:52:29+0100, Tom de Vries <tdevries@suse.de> wrote: > On 2/15/22 08:34, Thomas Schwinge wrote: >> For my understanding: Thanks for your explanations! >> It is expected that this changes, for example (similar elsewhere) >> 'nvptx-none/libatomic/store_4_.o', to use (a) 'atom.exch' (with a new >> dummy register allocated) > > Yes. > > We could do slightly better by emitting that as: > ... > membar.sys; > { .reg .u32 dummy; > atom.exch.b32 dummy,[%r22],%r23; > } > membar.sys; > ... > which could improve register pressure. Or, use the "bit bucket" operand -- assuming that's applicable here? atom.exch.b32 _,[%r22],%r23; For example, see PTX 3.1, 8.2 "PTX Instructions". Grüße Thomas > I just wrote a patch for that (attached, ftr), but using a scratch > register, and it seems that this similar code: > ... > void > foo (U_4 *mptr, U_4 newval) > { > __atomic_exchange_n (mptr, newval, 5); > } > ... > still maps to: > ... > .reg .u32 %r24; > membar.sys; > atom.exch.b32 %r24,[%r22],%r23; > membar.sys; > ... > so that may not be the right way to do it. > --- a/gcc/config/nvptx/nvptx.md > +++ b/gcc/config/nvptx/nvptx.md > @@ -89,9 +89,10 @@ > ;; only literal constants, which differ from the generic ones, which > ;; permit subregs and symbolc constants (as appropriate) > (define_predicate "nvptx_register_operand" > - (match_code "reg") > + (match_code "reg,scratch") > { > - return register_operand (op, mode); > + return (register_operand (op, mode) > + || (GET_CODE (op) == SCRATCH && GET_MODE (op) == mode)); > }) > > (define_predicate "nvptx_nonimmediate_operand" > @@ -188,7 +189,7 @@ > > (define_constraint "R" > "A pseudo register." > - (match_code "reg")) > + (ior (match_code "reg") (match_code "scratch"))) > > (define_constraint "Ia" > "Any integer constant." > @@ -2036,6 +2037,7 @@ > (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input > "" > { > + bool scratch_dst_p = GET_CODE (operands[0]) == SCRATCH; > if (nvptx_mem_local_p (operands[1])) > { > output_asm_insn ("{", NULL); > @@ -2047,7 +2049,9 @@ > return ""; > } > const char *t > - = "%.\tatom%A1.exch.b%T0\t%0, %1, %2;"; > + = (scratch_dst_p > + ? "{ .reg.u%T0 dummy; %.\tatom%A1.exch.b%T0\t dummy,%1, %2; }" > + : "%.\tatom%A1.exch.b%T0\t%0, %1, %2;"); > return nvptx_output_atomic_insn (t, operands, 1, 3); > } > [(set_attr "atomic" "true")]) > @@ -2079,7 +2083,7 @@ > /* Fall back to expand_atomic_store. */ > FAIL; > > - rtx tmpreg = gen_reg_rtx (<MODE>mode); > + rtx tmpreg = gen_rtx_SCRATCH (<MODE>mode); > emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1], > operands[2])); > DONE; ----------------- Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
On 2/15/22 12:08, Thomas Schwinge wrote: > Hi Tom! > > On 2022-02-15T11:52:29+0100, Tom de Vries <tdevries@suse.de> wrote: >> On 2/15/22 08:34, Thomas Schwinge wrote: >>> For my understanding: > > Thanks for your explanations! > >>> It is expected that this changes, for example (similar elsewhere) >>> 'nvptx-none/libatomic/store_4_.o', to use (a) 'atom.exch' (with a new >>> dummy register allocated) >> >> Yes. >> >> We could do slightly better by emitting that as: >> ... >> membar.sys; >> { .reg .u32 dummy; >> atom.exch.b32 dummy,[%r22],%r23; >> } >> membar.sys; >> ... >> which could improve register pressure. > > Or, use the "bit bucket" operand -- assuming that's applicable here? > > atom.exch.b32 _,[%r22],%r23; > Ah, yes, that looks exactly what we need, thanks for pointing that out :) I'll try to create a patch for this. Thanks, - Tom > For example, see PTX 3.1, 8.2 "PTX Instructions". > > > Grüße > Thomas > > >> I just wrote a patch for that (attached, ftr), but using a scratch >> register, and it seems that this similar code: >> ... >> void >> foo (U_4 *mptr, U_4 newval) >> { >> __atomic_exchange_n (mptr, newval, 5); >> } >> ... >> still maps to: >> ... >> .reg .u32 %r24; >> membar.sys; >> atom.exch.b32 %r24,[%r22],%r23; >> membar.sys; >> ... >> so that may not be the right way to do it. > >> --- a/gcc/config/nvptx/nvptx.md >> +++ b/gcc/config/nvptx/nvptx.md >> @@ -89,9 +89,10 @@ >> ;; only literal constants, which differ from the generic ones, which >> ;; permit subregs and symbolc constants (as appropriate) >> (define_predicate "nvptx_register_operand" >> - (match_code "reg") >> + (match_code "reg,scratch") >> { >> - return register_operand (op, mode); >> + return (register_operand (op, mode) >> + || (GET_CODE (op) == SCRATCH && GET_MODE (op) == mode)); >> }) >> >> (define_predicate "nvptx_nonimmediate_operand" >> @@ -188,7 +189,7 @@ >> >> (define_constraint "R" >> "A pseudo register." >> - (match_code "reg")) >> + (ior (match_code "reg") (match_code "scratch"))) >> >> (define_constraint "Ia" >> "Any integer constant." >> @@ -2036,6 +2037,7 @@ >> (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input >> "" >> { >> + bool scratch_dst_p = GET_CODE (operands[0]) == SCRATCH; >> if (nvptx_mem_local_p (operands[1])) >> { >> output_asm_insn ("{", NULL); >> @@ -2047,7 +2049,9 @@ >> return ""; >> } >> const char *t >> - = "%.\tatom%A1.exch.b%T0\t%0, %1, %2;"; >> + = (scratch_dst_p >> + ? "{ .reg.u%T0 dummy; %.\tatom%A1.exch.b%T0\t dummy,%1, %2; }" >> + : "%.\tatom%A1.exch.b%T0\t%0, %1, %2;"); >> return nvptx_output_atomic_insn (t, operands, 1, 3); >> } >> [(set_attr "atomic" "true")]) >> @@ -2079,7 +2083,7 @@ >> /* Fall back to expand_atomic_store. */ >> FAIL; >> >> - rtx tmpreg = gen_reg_rtx (<MODE>mode); >> + rtx tmpreg = gen_rtx_SCRATCH (<MODE>mode); >> emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1], >> operands[2])); >> DONE; > ----------------- > Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index a846e341917..0bf9af406a2 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -60,5 +60,6 @@ extern const char *nvptx_output_simt_exit (rtx); extern const char *nvptx_output_red_partition (rtx, rtx); extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int); extern bool nvptx_mem_local_p (rtx); +extern bool nvptx_mem_maybe_shared_p (const_rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 1b0227a2c31..5b26c0f4c7d 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -76,6 +76,7 @@ #include "intl.h" #include "opts.h" #include "tree-pretty-print.h" +#include "rtl-iter.h" /* This file should be included last. */ #include "target-def.h" @@ -2787,6 +2788,27 @@ nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr) nvptx_print_address_operand (file, addr, mode); } +static nvptx_data_area +nvptx_mem_data_area (const_rtx x) +{ + gcc_assert (GET_CODE (x) == MEM); + + const_rtx addr = XEXP (x, 0); + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, addr, ALL) + if (SYMBOL_REF_P (*iter)) + return SYMBOL_DATA_AREA (*iter); + + return DATA_AREA_GENERIC; +} + +bool +nvptx_mem_maybe_shared_p (const_rtx x) +{ + nvptx_data_area area = nvptx_mem_data_area (x); + return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC; +} + /* Print an operand, X, to FILE, with an optional modifier in CODE. Meaning of CODE: diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index cced68e0d4a..1a283b41922 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -2051,6 +2051,36 @@ (define_insn "atomic_exchange<mode>" } [(set_attr "atomic" "true")]) +(define_expand "atomic_store<mode>" + [(match_operand:SDIM 0 "memory_operand" "=m") ;; memory + (match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri") ;; input + (match_operand:SI 2 "const_int_operand")] ;; model + "" +{ + struct address_info info; + decompose_mem_address (&info, operands[0]); + if (info.base != NULL && REG_P (*info.base) + && REGNO_PTR_FRAME_P (REGNO (*info.base))) + { + emit_insn (gen_mov<mode> (operands[0], operands[1])); + DONE; + } + + if (TARGET_SM70) + /* Fall back to expand_atomic_store. */ + FAIL; + + bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]); + if (!maybe_shared_p) + /* Fall back to expand_atomic_store. */ + FAIL; + + rtx tmpreg = gen_reg_rtx (<MODE>mode); + emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1], + operands[2])); + DONE; +}) + (define_insn "atomic_fetch_add<mode>" [(set (match_operand:SDIM 1 "memory_operand" "+m") (unspec_volatile:SDIM diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c new file mode 100644 index 00000000000..cee3815eda5 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c @@ -0,0 +1,26 @@ +/* Test the atomic store expansion for sm <= sm_6x targets, + shared state space. */ + +/* { dg-do compile } */ +/* { dg-options "-misa=sm_53" } */ + +enum memmodel +{ + MEMMODEL_SEQ_CST = 5 +}; + +unsigned int u32 __attribute__((shared)); +unsigned long long int u64 __attribute__((shared)); + +int +main() +{ + __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST); + __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST); + + return 0; +} + +/* { dg-final { scan-assembler-times "atom.shared.exch.b32" 1 } } */ +/* { dg-final { scan-assembler-times "atom.shared.exch.b64" 1 } } */ +/* { dg-final { scan-assembler-times "membar.cta" 4 } } */ diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c new file mode 100644 index 00000000000..cc0264f2b06 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c @@ -0,0 +1,25 @@ +/* Test the atomic store expansion, global state space. */ + +/* { dg-do compile } */ +/* { dg-additional-options "-Wno-long-long" } */ + +enum memmodel +{ + MEMMODEL_SEQ_CST = 5 +}; + +unsigned int u32; +unsigned long long int u64; + +int +main() +{ + __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST); + __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST); + + return 0; +} + +/* { dg-final { scan-assembler-times "st.global.u32" 1 } } */ +/* { dg-final { scan-assembler-times "st.global.u64" 1 } } */ +/* { dg-final { scan-assembler-times "membar.sys" 4 } } */ diff --git a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c index ad8e2f842fb..cd045964dfe 100644 --- a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c +++ b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c @@ -39,6 +39,10 @@ main (void) if (b != 1) __builtin_abort (); - + a = 1; + __atomic_store_n (&a, 0, MEMMODEL_RELAXED); + if (a != 0) + __builtin_abort (); + return 0; }