Message ID | 20190107174014.GA17007@intel.com |
---|---|
State | New |
Headers | show |
Series | x86: Don't generate vzeroupper if caller is AVX_U128_DIRTY | expand |
On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote: > > There is no need to generate vzeroupper if caller uses upper bits of > AVX/AVX512 registers, We track caller's avx_u128_state and avoid > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY. > > Tested on i686 and x86-64 with and without --with-arch=native. > > OK for trunk? In principle OK, but I think we don't have to cache the result of ix86_avx_u128_mode_entry. Simply call the function from ix86_avx_u128_mode_exit; it is a simple function, so I guess we can afford to re-call it one more time per function. Uros. > Thanks. > > H.J. > --- > gcc/ > > PR target/88717 > * config/i386/i386.c (ix86_avx_u128_mode_entry): Set > caller_avx_u128_dirty to true when caller is AVX_U128_DIRTY. > (ix86_avx_u128_mode_exit): Set exit mode to AVX_U128_DIRTY if > caller is AVX_U128_DIRTY. > * config/i386/i386.h (machine_function): Add > caller_avx_u128_dirty. > > gcc/testsuite/ > > PR target/88717 > * gcc.target/i386/pr88717.c: New test. > --- > gcc/config/i386/i386.c | 10 +++++++++- > gcc/config/i386/i386.h | 3 +++ > gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++ > 3 files changed, 36 insertions(+), 1 deletion(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index d01278d866f..9b49a2c1d9c 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -19100,7 +19100,11 @@ ix86_avx_u128_mode_entry (void) > rtx incoming = DECL_INCOMING_RTL (arg); > > if (incoming && ix86_check_avx_upper_register (incoming)) > - return AVX_U128_DIRTY; > + { > + /* Caller is AVX_U128_DIRTY. */ > + cfun->machine->caller_avx_u128_dirty = true; > + return AVX_U128_DIRTY; > + } > } > > return AVX_U128_CLEAN; > @@ -19130,6 +19134,10 @@ ix86_mode_entry (int entity) > static int > ix86_avx_u128_mode_exit (void) > { > + /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY. */ > + if (cfun->machine->caller_avx_u128_dirty) > + return AVX_U128_DIRTY; > + > rtx reg = crtl->return_rtx; > > /* Exit mode is set to AVX_U128_DIRTY if there are 256bit > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index 83b025e0cf5..c053b657a55 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -2747,6 +2747,9 @@ struct GTY(()) machine_function { > /* If true, ENDBR is queued at function entrance. */ > BOOL_BITFIELD endbr_queued_at_entrance : 1; > > + /* If true, caller is AVX_U128_DIRTY. */ > + BOOL_BITFIELD caller_avx_u128_dirty : 1; > + > /* The largest alignment, in bytes, of stack slot actually used. */ > unsigned int max_used_stack_alignment; > > diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c > new file mode 100644 > index 00000000000..01680998f1b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88717.c > @@ -0,0 +1,24 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx512f -mvzeroupper" } */ > + > +#include <immintrin.h> > + > +__m128 > +foo1 (__m256 x) > +{ > + return _mm256_castps256_ps128 (x); > +} > + > +void > +foo2 (float *p, __m256 x) > +{ > + *p = ((__v8sf)x)[0]; > +} > + > +void > +foo3 (float *p, __m512 x) > +{ > + *p = ((__v16sf)x)[0]; > +} > + > +/* { dg-final { scan-assembler-not "vzeroupper" } } */ > -- > 2.20.1 >
On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote: > > > > There is no need to generate vzeroupper if caller uses upper bits of > > AVX/AVX512 registers, We track caller's avx_u128_state and avoid > > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY. > > > > Tested on i686 and x86-64 with and without --with-arch=native. > > > > OK for trunk? > > In principle OK, but I think we don't have to cache the result of > ix86_avx_u128_mode_entry. Simply call the function from > ix86_avx_u128_mode_exit; it is a simple function, so I guess we can > afford to re-call it one more time per function. Do we really need ix86_avx_u128_mode_entry? We can just set entry state to AVX_U128_CLEAN and set exit state to AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes AVX/AVX512 registers to callee. Does this patch look OK? Thanks. H.J. -- diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index d01278d866f..1ac89fd2eb5 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19087,25 +19087,6 @@ ix86_dirflag_mode_entry (void) return X86_DIRFLAG_RESET; } -static int -ix86_avx_u128_mode_entry (void) -{ - tree arg; - - /* Entry mode is set to AVX_U128_DIRTY if there are - 256bit or 512bit modes used in function arguments. */ - for (arg = DECL_ARGUMENTS (current_function_decl); arg; - arg = TREE_CHAIN (arg)) - { - rtx incoming = DECL_INCOMING_RTL (arg); - - if (incoming && ix86_check_avx_upper_register (incoming)) - return AVX_U128_DIRTY; - } - - return AVX_U128_CLEAN; -} - /* Return a mode that ENTITY is assumed to be switched to at function entry. */ @@ -19117,7 +19098,7 @@ ix86_mode_entry (int entity) case X86_DIRFLAG: return ix86_dirflag_mode_entry (); case AVX_U128: - return ix86_avx_u128_mode_entry (); + return AVX_U128_CLEAN; case I387_TRUNC: case I387_FLOOR: case I387_CEIL: @@ -19130,13 +19111,24 @@ ix86_mode_entry (int entity) static int ix86_avx_u128_mode_exit (void) { + /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit + modes used in function arguments or function return.. */ rtx reg = crtl->return_rtx; - /* Exit mode is set to AVX_U128_DIRTY if there are 256bit - or 512 bit modes used in the function return register. */ if (reg && ix86_check_avx_upper_register (reg)) return AVX_U128_DIRTY; + tree arg; + + for (arg = DECL_ARGUMENTS (current_function_decl); arg; + arg = TREE_CHAIN (arg)) + { + rtx incoming = DECL_INCOMING_RTL (arg); + + if (incoming && ix86_check_avx_upper_register (incoming)) + return AVX_U128_DIRTY; + } + return AVX_U128_CLEAN; }
On Tue, Jan 8, 2019 at 3:39 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Mon, Jan 7, 2019 at 11:12 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu.lu@intel.com> wrote: > > > > > > There is no need to generate vzeroupper if caller uses upper bits of > > > AVX/AVX512 registers, We track caller's avx_u128_state and avoid > > > vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY. > > > > > > Tested on i686 and x86-64 with and without --with-arch=native. > > > > > > OK for trunk? > > > > In principle OK, but I think we don't have to cache the result of > > ix86_avx_u128_mode_entry. Simply call the function from > > ix86_avx_u128_mode_exit; it is a simple function, so I guess we can > > afford to re-call it one more time per function. > > Do we really need ix86_avx_u128_mode_entry? We can just > set entry state to AVX_U128_CLEAN and set exit state to > AVX_U128_DIRTY if caller returns AVX/AVX512 register or passes > AVX/AVX512 registers to callee. > > Does this patch look OK? No, the compiler is then free to move optimal insertion point at the beginning of the function. Uros. > Thanks. > > H.J. > -- > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index d01278d866f..1ac89fd2eb5 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -19087,25 +19087,6 @@ ix86_dirflag_mode_entry (void) > return X86_DIRFLAG_RESET; > } > > -static int > -ix86_avx_u128_mode_entry (void) > -{ > - tree arg; > - > - /* Entry mode is set to AVX_U128_DIRTY if there are > - 256bit or 512bit modes used in function arguments. */ > - for (arg = DECL_ARGUMENTS (current_function_decl); arg; > - arg = TREE_CHAIN (arg)) > - { > - rtx incoming = DECL_INCOMING_RTL (arg); > - > - if (incoming && ix86_check_avx_upper_register (incoming)) > - return AVX_U128_DIRTY; > - } > - > - return AVX_U128_CLEAN; > -} > - > /* Return a mode that ENTITY is assumed to be > switched to at function entry. */ > > @@ -19117,7 +19098,7 @@ ix86_mode_entry (int entity) > case X86_DIRFLAG: > return ix86_dirflag_mode_entry (); > case AVX_U128: > - return ix86_avx_u128_mode_entry (); > + return AVX_U128_CLEAN; > case I387_TRUNC: > case I387_FLOOR: > case I387_CEIL: > @@ -19130,13 +19111,24 @@ ix86_mode_entry (int entity) > static int > ix86_avx_u128_mode_exit (void) > { > + /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit > + modes used in function arguments or function return.. */ > rtx reg = crtl->return_rtx; > > - /* Exit mode is set to AVX_U128_DIRTY if there are 256bit > - or 512 bit modes used in the function return register. */ > if (reg && ix86_check_avx_upper_register (reg)) > return AVX_U128_DIRTY; > > + tree arg; > + > + for (arg = DECL_ARGUMENTS (current_function_decl); arg; > + arg = TREE_CHAIN (arg)) > + { > + rtx incoming = DECL_INCOMING_RTL (arg); > + > + if (incoming && ix86_check_avx_upper_register (incoming)) > + return AVX_U128_DIRTY; > + } > + > return AVX_U128_CLEAN; > }
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index d01278d866f..9b49a2c1d9c 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19100,7 +19100,11 @@ ix86_avx_u128_mode_entry (void) rtx incoming = DECL_INCOMING_RTL (arg); if (incoming && ix86_check_avx_upper_register (incoming)) - return AVX_U128_DIRTY; + { + /* Caller is AVX_U128_DIRTY. */ + cfun->machine->caller_avx_u128_dirty = true; + return AVX_U128_DIRTY; + } } return AVX_U128_CLEAN; @@ -19130,6 +19134,10 @@ ix86_mode_entry (int entity) static int ix86_avx_u128_mode_exit (void) { + /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY. */ + if (cfun->machine->caller_avx_u128_dirty) + return AVX_U128_DIRTY; + rtx reg = crtl->return_rtx; /* Exit mode is set to AVX_U128_DIRTY if there are 256bit diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 83b025e0cf5..c053b657a55 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2747,6 +2747,9 @@ struct GTY(()) machine_function { /* If true, ENDBR is queued at function entrance. */ BOOL_BITFIELD endbr_queued_at_entrance : 1; + /* If true, caller is AVX_U128_DIRTY. */ + BOOL_BITFIELD caller_avx_u128_dirty : 1; + /* The largest alignment, in bytes, of stack slot actually used. */ unsigned int max_used_stack_alignment; diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c new file mode 100644 index 00000000000..01680998f1b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88717.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f -mvzeroupper" } */ + +#include <immintrin.h> + +__m128 +foo1 (__m256 x) +{ + return _mm256_castps256_ps128 (x); +} + +void +foo2 (float *p, __m256 x) +{ + *p = ((__v8sf)x)[0]; +} + +void +foo3 (float *p, __m512 x) +{ + *p = ((__v16sf)x)[0]; +} + +/* { dg-final { scan-assembler-not "vzeroupper" } } */