Message ID | 20180712195736.GA54874@intel.com |
---|---|
State | New |
Headers | show |
Series | x86: Tune Skylake, Cannonlake and Icelake as Haswell | expand |
On Thu, Jul 12, 2018 at 9:57 PM, H.J. Lu <hongjiu.lu@intel.com> wrote: > r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations > which are enabled by PROCESSOR_HASWELL. As the result, -mtune=skylake > generates slower codes on Skylake than before. The same also applies > to Cannonlake and Icelak tuning. > > This patch changes -mtune={skylake|cannonlake|icelake} to tune like > -mtune=haswell for until their tuning is properly adjusted. It also > enables -mprefer-vector-width=256 for -mtune=haswell, which has no > impact on codegen when AVX512 isn't enabled. > > Performance impacts on SPEC CPU 2017 rate with 1 copy using > > -march=native -mfpmath=sse -O2 -m64 > > are > > 1. On Broadwell server: > > 500.perlbench_r -0.56% > 502.gcc_r -0.18% > 505.mcf_r 0.24% > 520.omnetpp_r 0.00% > 523.xalancbmk_r -0.32% > 525.x264_r -0.17% > 531.deepsjeng_r 0.00% > 541.leela_r 0.00% > 548.exchange2_r 0.12% > 557.xz_r 0.00% > geomean 0.00% > > 503.bwaves_r 0.00% > 507.cactuBSSN_r 0.21% > 508.namd_r 0.00% > 510.parest_r 0.19% > 511.povray_r -0.48% > 519.lbm_r 0.00% > 521.wrf_r 0.28% > 526.blender_r 0.19% > 527.cam4_r 0.39% > 538.imagick_r 0.00% > 544.nab_r -0.36% > 549.fotonik3d_r 0.51% > 554.roms_r 0.00% > geomean 0.17% > > On Skylake client: > > 500.perlbench_r 0.96% > 502.gcc_r 0.13% > 505.mcf_r -1.03% > 520.omnetpp_r -1.11% > 523.xalancbmk_r 1.02% > 525.x264_r 0.50% > 531.deepsjeng_r 2.97% > 541.leela_r 0.50% > 548.exchange2_r -0.95% > 557.xz_r 2.41% > geomean 0.56% > > 503.bwaves_r 0.49% > 507.cactuBSSN_r 3.17% > 508.namd_r 4.05% > 510.parest_r 0.15% > 511.povray_r 0.80% > 519.lbm_r 3.15% > 521.wrf_r 10.56% > 526.blender_r 2.97% > 527.cam4_r 2.36% > 538.imagick_r 46.40% > 544.nab_r 2.04% > 549.fotonik3d_r 0.00% > 554.roms_r 1.27% > geomean 5.49% > > On Skylake server: > > 500.perlbench_r 0.71% > 502.gcc_r -0.51% > 505.mcf_r -1.06% > 520.omnetpp_r -0.33% > 523.xalancbmk_r -0.22% > 525.x264_r 1.72% > 531.deepsjeng_r -0.26% > 541.leela_r 0.57% > 548.exchange2_r 
-0.75% > 557.xz_r -1.28% > geomean -0.21% > > 503.bwaves_r 0.00% > 507.cactuBSSN_r 2.66% > 508.namd_r 3.67% > 510.parest_r 1.25% > 511.povray_r 2.26% > 519.lbm_r 1.69% > 521.wrf_r 11.03% > 526.blender_r 3.39% > 527.cam4_r 1.69% > 538.imagick_r 64.59% > 544.nab_r -0.54% > 549.fotonik3d_r 2.68% > 554.roms_r 0.00% > geomean 6.19% > > This patch improves -march=native performance on Skylake up to 60% and > leaves -march=native performance unchanged on Haswell. > > OK for trunk? > > Thanks. > > H.J. > --- > gcc/ > > 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> > Sunil K Pandey <sunil.k.pandey@intel.com> > > PR target/84413 > * config/i386/i386.c (m_HASWELL): Add PROCESSOR_SKYLAKE, > PROCESSOR_SKYLAKE_AVX512, PROCESSOR_CANNONLAKE, > PROCESSOR_ICELAKE_CLIENT and PROCESSOR_ICELAKE_SERVER. > (m_SKYLAKE): Set to 0. > (m_SKYLAKE_AVX512): Likewise. > (m_CANNONLAKE): Likewise. > (m_ICELAKE_CLIENT): Likewise. > (m_ICELAKE_SERVER): Likewise. > * config/i386/x86-tune.def (avx256_optimal): Also enabled for > m_HASWELL. > > gcc/testsuite/ > > 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> > Sunil K Pandey <sunil.k.pandey@intel.com> > > PR target/84413 > * gcc.target/i386/pr84413-1.c: New test. > * gcc.target/i386/pr84413-2.c: Likewise. > * gcc.target/i386/pr84413-3.c: Likewise. > * gcc.target/i386/pr84413-4.c: Likewise. 
> --- > gcc/config/i386/i386.c | 17 +++++++++++------ > gcc/config/i386/x86-tune.def | 9 ++++++--- > gcc/testsuite/gcc.target/i386/pr84413-1.c | 17 +++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-2.c | 17 +++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-3.c | 17 +++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-4.c | 17 +++++++++++++++++ > 6 files changed, 85 insertions(+), 9 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-4.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 9e46b7b136f..762ab89fc9e 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -137,17 +137,22 @@ const struct processor_costs *ix86_cost = NULL; > #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2) > #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) > #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) > -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) > +#define m_HASWELL ((HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) \ > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) \ > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) \ > + | (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) \ > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) \ > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)) > Please introduce a new per-family define and group processors in this define. Something like m_BDVER, m_BTVER and m_AMD_MULTIPLE for AMD targets. We should not redefine m_HASWELL to include unrelated families. Uros. 
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) > #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL) > #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT) > #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL) > #define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM) > -#define m_SKYLAKE (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) > -#define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) > -#define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) > -#define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) > -#define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER) > +#define m_SKYLAKE 0 > +#define m_SKYLAKE_AVX512 0 > +#define m_CANNONLAKE 0 > +#define m_ICELAKE_CLIENT 0 > +#define m_ICELAKE_SERVER 0 > #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT) > #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS) > #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index 8a8d5ab2440..c8abaedad8c 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -444,9 +444,12 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, > "256_unaligned_store_optimal" > DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 > | m_ZNVER1) > > -/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of > 512-bit AVX > - instructions in the auto-vectorizer. */ > -DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512) > +/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of > 512-bit > + AVX instructions in the auto-vectorizer. NB: This is also enabled for > + -mtune=haswell so that we can tune Skylake, Cannonlake and Icelake as > + Haswell. 
*/ > +DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512 > + | m_HASWELL) > > /*********************************************************** > ******************/ > /* Historical relics: tuning flags that helps a specific old CPU designs > */ > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-1.c > b/gcc/testsuite/gcc.target/i386/pr84413-1.c > new file mode 100644 > index 00000000000..1c94d7715cf > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-1.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=skylake-avx512" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } > */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-2.c > b/gcc/testsuite/gcc.target/i386/pr84413-2.c > new file mode 100644 > index 00000000000..adf9b527cd6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=cannonlake" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } > */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-3.c > b/gcc/testsuite/gcc.target/i386/pr84413-3.c > new file mode 100644 > index 00000000000..76bf25fc56b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-3.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=icelake-server" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } > */ > + > +#define N 
1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-4.c > b/gcc/testsuite/gcc.target/i386/pr84413-4.c > new file mode 100644 > index 00000000000..031ef0c8916 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-4.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=haswell -mavx512f" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } > */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > -- > 2.17.1 > >
On Fri, Jul 13, 2018 at 08:53:02AM +0200, Uros Bizjak wrote: > On Thu, Jul 12, 2018 at 9:57 PM, H.J. Lu <hongjiu.lu@intel.com> wrote: > > > r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations > > which are enabled by PROCESSOR_HASWELL. As the result, -mtune=skylake > > generates slower codes on Skylake than before. The same also applies > > to Cannonlake and Icelak tuning. > > > > This patch changes -mtune={skylake|cannonlake|icelake} to tune like > > -mtune=haswell for until their tuning is properly adjusted. It also > > enables -mprefer-vector-width=256 for -mtune=haswell, which has no > > impact on codegen when AVX512 isn't enabled. > > > > Performance impacts on SPEC CPU 2017 rate with 1 copy using > > > > -march=native -mfpmath=sse -O2 -m64 > > > > are > > > > 1. On Broadwell server: > > > > 500.perlbench_r -0.56% > > 502.gcc_r -0.18% > > 505.mcf_r 0.24% > > 520.omnetpp_r 0.00% > > 523.xalancbmk_r -0.32% > > 525.x264_r -0.17% > > 531.deepsjeng_r 0.00% > > 541.leela_r 0.00% > > 548.exchange2_r 0.12% > > 557.xz_r 0.00% > > geomean 0.00% > > > > 503.bwaves_r 0.00% > > 507.cactuBSSN_r 0.21% > > 508.namd_r 0.00% > > 510.parest_r 0.19% > > 511.povray_r -0.48% > > 519.lbm_r 0.00% > > 521.wrf_r 0.28% > > 526.blender_r 0.19% > > 527.cam4_r 0.39% > > 538.imagick_r 0.00% > > 544.nab_r -0.36% > > 549.fotonik3d_r 0.51% > > 554.roms_r 0.00% > > geomean 0.17% > > > > On Skylake client: > > > > 500.perlbench_r 0.96% > > 502.gcc_r 0.13% > > 505.mcf_r -1.03% > > 520.omnetpp_r -1.11% > > 523.xalancbmk_r 1.02% > > 525.x264_r 0.50% > > 531.deepsjeng_r 2.97% > > 541.leela_r 0.50% > > 548.exchange2_r -0.95% > > 557.xz_r 2.41% > > geomean 0.56% > > > > 503.bwaves_r 0.49% > > 507.cactuBSSN_r 3.17% > > 508.namd_r 4.05% > > 510.parest_r 0.15% > > 511.povray_r 0.80% > > 519.lbm_r 3.15% > > 521.wrf_r 10.56% > > 526.blender_r 2.97% > > 527.cam4_r 2.36% > > 538.imagick_r 46.40% > > 544.nab_r 2.04% > > 549.fotonik3d_r 0.00% > > 554.roms_r 1.27% > > geomean 5.49% > > > > 
On Skylake server: > > > > 500.perlbench_r 0.71% > > 502.gcc_r -0.51% > > 505.mcf_r -1.06% > > 520.omnetpp_r -0.33% > > 523.xalancbmk_r -0.22% > > 525.x264_r 1.72% > > 531.deepsjeng_r -0.26% > > 541.leela_r 0.57% > > 548.exchange2_r -0.75% > > 557.xz_r -1.28% > > geomean -0.21% > > > > 503.bwaves_r 0.00% > > 507.cactuBSSN_r 2.66% > > 508.namd_r 3.67% > > 510.parest_r 1.25% > > 511.povray_r 2.26% > > 519.lbm_r 1.69% > > 521.wrf_r 11.03% > > 526.blender_r 3.39% > > 527.cam4_r 1.69% > > 538.imagick_r 64.59% > > 544.nab_r -0.54% > > 549.fotonik3d_r 2.68% > > 554.roms_r 0.00% > > geomean 6.19% > > > > This patch improves -march=native performance on Skylake up to 60% and > > leaves -march=native performance unchanged on Haswell. > > > > OK for trunk? > > > > Thanks. > > > > H.J. > > --- > > gcc/ > > > > 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> > > Sunil K Pandey <sunil.k.pandey@intel.com> > > > > PR target/84413 > > * config/i386/i386.c (m_HASWELL): Add PROCESSOR_SKYLAKE, > > PROCESSOR_SKYLAKE_AVX512, PROCESSOR_CANNONLAKE, > > PROCESSOR_ICELAKE_CLIENT and PROCESSOR_ICELAKE_SERVER. > > (m_SKYLAKE): Set to 0. > > (m_SKYLAKE_AVX512): Likewise. > > (m_CANNONLAKE): Likewise. > > (m_ICELAKE_CLIENT): Likewise. > > (m_ICELAKE_SERVER): Likewise. > > * config/i386/x86-tune.def (avx256_optimal): Also enabled for > > m_HASWELL. 
> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > index 9e46b7b136f..762ab89fc9e 100644 > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -137,17 +137,22 @@ const struct processor_costs *ix86_cost = NULL; > > #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2) > > #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) > > #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) > > -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) > > +#define m_HASWELL ((HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) \ > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) \ > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) \ > > + | (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) \ > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) \ > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)) > > > > Please introduce a new per-family define and group processors in this > define. Something like m_BDVER, m_BTVER and m_AMD_MULTIPLE for AMD targets. > We should not redefine m_HASWELL to include unrelated families. > Here is the updated patch. OK for trunk if all tests pass? Thanks. H.J. ---- r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations which are enabled by PROCESSOR_HASWELL. As the result, -mtune=skylake generates slower codes on Skylake than before. The same also applies to Cannonlake and Icelak tuning. This patch changes -mtune={skylake|cannonlake|icelake} to tune like -mtune=haswell for until their tuning is properly adjusted. It also enables -mprefer-vector-width=256 for -mtune=haswell, which has no impact on codegen when AVX512 isn't enabled. Performance impacts on SPEC CPU 2017 rate with 1 copy using -march=native -mfpmath=sse -O2 -m64 are 1. 
On Broadwell server: 500.perlbench_r -0.56% 502.gcc_r -0.18% 505.mcf_r 0.24% 520.omnetpp_r 0.00% 523.xalancbmk_r -0.32% 525.x264_r -0.17% 531.deepsjeng_r 0.00% 541.leela_r 0.00% 548.exchange2_r 0.12% 557.xz_r 0.00% Geomean 0.00% 503.bwaves_r 0.00% 507.cactuBSSN_r 0.21% 508.namd_r 0.00% 510.parest_r 0.19% 511.povray_r -0.48% 519.lbm_r 0.00% 521.wrf_r 0.28% 526.blender_r 0.19% 527.cam4_r 0.39% 538.imagick_r 0.00% 544.nab_r -0.36% 549.fotonik3d_r 0.51% 554.roms_r 0.00% Geomean 0.17% On Skylake client: 500.perlbench_r 0.96% 502.gcc_r 0.13% 505.mcf_r -1.03% 520.omnetpp_r -1.11% 523.xalancbmk_r 1.02% 525.x264_r 0.50% 531.deepsjeng_r 2.97% 541.leela_r 0.50% 548.exchange2_r -0.95% 557.xz_r 2.41% Geomean 0.56% 503.bwaves_r 0.49% 507.cactuBSSN_r 3.17% 508.namd_r 4.05% 510.parest_r 0.15% 511.povray_r 0.80% 519.lbm_r 3.15% 521.wrf_r 10.56% 526.blender_r 2.97% 527.cam4_r 2.36% 538.imagick_r 46.40% 544.nab_r 2.04% 549.fotonik3d_r 0.00% 554.roms_r 1.27% Geomean 5.49% On Skylake server: 500.perlbench_r 0.71% 502.gcc_r -0.51% 505.mcf_r -1.06% 520.omnetpp_r -0.33% 523.xalancbmk_r -0.22% 525.x264_r 1.72% 531.deepsjeng_r -0.26% 541.leela_r 0.57% 548.exchange2_r -0.75% 557.xz_r -1.28% Geomean -0.21% 503.bwaves_r 0.00% 507.cactuBSSN_r 2.66% 508.namd_r 3.67% 510.parest_r 1.25% 511.povray_r 2.26% 519.lbm_r 1.69% 521.wrf_r 11.03% 526.blender_r 3.39% 527.cam4_r 1.69% 538.imagick_r 64.59% 544.nab_r -0.54% 549.fotonik3d_r 2.68% 554.roms_r 0.00% Geomean 6.19% This patch improves -march=native performance on Skylake up to 60% and leaves -march=native performance unchanged on Haswell. gcc/ 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> Sunil K Pandey <sunil.k.pandey@intel.com> PR target/84413 * config/i386/i386.c (m_CORE_AVX512): New. (m_CORE_AVX2): Likewise. (m_CORE_ALL): Add m_CORE_AVX2. * config/i386/x86-tune.def: Replace m_HASWELL with m_CORE_AVX2. Replace m_SKYLAKE_AVX512 with m_CORE_AVX512 on avx256_optimal and remove the rest of m_SKYLAKE_AVX512. gcc/testsuite/ 2018-07-12 H.J. 
Lu <hongjiu.lu@intel.com> Sunil K Pandey <sunil.k.pandey@intel.com> PR target/84413 * gcc.target/i386/pr84413-1.c: New test. * gcc.target/i386/pr84413-2.c: Likewise. * gcc.target/i386/pr84413-3.c: Likewise. * gcc.target/i386/pr84413-4.c: Likewise. --- gcc/config/i386/i386.c | 5 ++++- gcc/config/i386/x86-tune.def | 26 +++++++++++------------ gcc/testsuite/gcc.target/i386/pr84413-1.c | 17 +++++++++++++++ gcc/testsuite/gcc.target/i386/pr84413-2.c | 17 +++++++++++++++ gcc/testsuite/gcc.target/i386/pr84413-3.c | 17 +++++++++++++++ gcc/testsuite/gcc.target/i386/pr84413-4.c | 17 +++++++++++++++ 6 files changed, 85 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-4.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 9e46b7b136f..ccc24e375ad 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -138,7 +138,6 @@ const struct processor_costs *ix86_cost = NULL; #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) #define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) -#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL) #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT) #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL) @@ -148,6 +147,10 @@ const struct processor_costs *ix86_cost = NULL; #define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) #define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) #define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER) +#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \ + | m_ICELAKE_CLIENT | m_ICELAKE_SERVER) +#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512) +#define m_CORE_ALL (m_CORE2 | m_NEHALEM 
| m_SANDYBRIDGE | m_CORE_AVX2) #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT) #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS) #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 8a8d5ab2440..a46450ad99d 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -49,9 +49,9 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", over partial stores. For example preffer MOVZBL or MOVQ to load 8bit value over movb. */ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", - m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL + m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL - | m_KNL | m_KNM | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 | m_TREMONT + | m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT | m_GENERIC) /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store @@ -87,8 +87,8 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", DEF_TUNE (X86_TUNE_MOVX, "movx", m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL - | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 - | m_HASWELL | m_TREMONT | m_GENERIC) + | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE + | m_CORE_AVX2 | m_TREMONT | m_GENERIC) /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by full sized loads. */ @@ -105,19 +105,19 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32", /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent conditional jump instruction for TARGET_64BIT. 
*/ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64", - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC) + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC) /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a subsequent conditional jump instruction when the condition jump check sign flag (SF) or overflow flag (OF). */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC) + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC) /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional jump instruction when the alu instruction produces the CCFLAG consumed by the conditional jump instruction. */ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_SANDYBRIDGE | m_HASWELL | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) /*****************************************************************************/ @@ -297,7 +297,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency for bit-manipulation instructions. */ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", - m_SANDYBRIDGE | m_HASWELL | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based on hardware capabilities. Bdver3 hardware has a loop buffer which makes @@ -349,15 +349,15 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill", /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead of a sequence loading registers by parts. 
*/ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal", - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM - | m_INTEL | m_SKYLAKE_AVX512 | m_GOLDMONT | m_GOLDMONT_PLUS + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM + | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER1 | m_GENERIC) /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead of a sequence loading registers by parts. */ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal", - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM - | m_INTEL | m_SKYLAKE_AVX512 | m_GOLDMONT | m_GOLDMONT_PLUS + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM + | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_BDVER | m_ZNVER1 | m_GENERIC) /* Use packed single precision instructions where posisble. I.e. movups instead @@ -446,7 +446,7 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX instructions in the auto-vectorizer. 
*/ -DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512) +DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512) /*****************************************************************************/ /* Historical relics: tuning flags that helps a specific old CPU designs */ diff --git a/gcc/testsuite/gcc.target/i386/pr84413-1.c b/gcc/testsuite/gcc.target/i386/pr84413-1.c new file mode 100644 index 00000000000..1c94d7715cf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=skylake-avx512" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr84413-2.c b/gcc/testsuite/gcc.target/i386/pr84413-2.c new file mode 100644 index 00000000000..adf9b527cd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=cannonlake" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr84413-3.c b/gcc/testsuite/gcc.target/i386/pr84413-3.c new file mode 100644 index 00000000000..76bf25fc56b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-3.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=icelake-server" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; 
i < N; i++) + c[i] = a[i] * b[i]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr84413-4.c b/gcc/testsuite/gcc.target/i386/pr84413-4.c new file mode 100644 index 00000000000..031ef0c8916 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-4.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=haswell -mavx512f" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +}
On Fri, Jul 13, 2018 at 3:12 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Fri, Jul 13, 2018 at 08:53:02AM +0200, Uros Bizjak wrote: > > On Thu, Jul 12, 2018 at 9:57 PM, H.J. Lu <hongjiu.lu@intel.com> wrote: > > > > > r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations > > > which are enabled by PROCESSOR_HASWELL. As the result, -mtune=skylake > > > generates slower codes on Skylake than before. The same also applies > > > to Cannonlake and Icelak tuning. > > > > > > This patch changes -mtune={skylake|cannonlake|icelake} to tune like > > > -mtune=haswell for until their tuning is properly adjusted. It also > > > enables -mprefer-vector-width=256 for -mtune=haswell, which has no > > > impact on codegen when AVX512 isn't enabled. > > > > > > Performance impacts on SPEC CPU 2017 rate with 1 copy using > > > > > > -march=native -mfpmath=sse -O2 -m64 > > > > > > are > > > > > > 1. On Broadwell server: > > > > > > 500.perlbench_r -0.56% > > > 502.gcc_r -0.18% > > > 505.mcf_r 0.24% > > > 520.omnetpp_r 0.00% > > > 523.xalancbmk_r -0.32% > > > 525.x264_r -0.17% > > > 531.deepsjeng_r 0.00% > > > 541.leela_r 0.00% > > > 548.exchange2_r 0.12% > > > 557.xz_r 0.00% > > > geomean 0.00% > > > > > > 503.bwaves_r 0.00% > > > 507.cactuBSSN_r 0.21% > > > 508.namd_r 0.00% > > > 510.parest_r 0.19% > > > 511.povray_r -0.48% > > > 519.lbm_r 0.00% > > > 521.wrf_r 0.28% > > > 526.blender_r 0.19% > > > 527.cam4_r 0.39% > > > 538.imagick_r 0.00% > > > 544.nab_r -0.36% > > > 549.fotonik3d_r 0.51% > > > 554.roms_r 0.00% > > > geomean 0.17% > > > > > > On Skylake client: > > > > > > 500.perlbench_r 0.96% > > > 502.gcc_r 0.13% > > > 505.mcf_r -1.03% > > > 520.omnetpp_r -1.11% > > > 523.xalancbmk_r 1.02% > > > 525.x264_r 0.50% > > > 531.deepsjeng_r 2.97% > > > 541.leela_r 0.50% > > > 548.exchange2_r -0.95% > > > 557.xz_r 2.41% > > > geomean 0.56% > > > > > > 503.bwaves_r 0.49% > > > 507.cactuBSSN_r 3.17% > > > 508.namd_r 4.05% > > > 510.parest_r 0.15% > > > 511.povray_r 0.80% > 
> > 519.lbm_r 3.15% > > > 521.wrf_r 10.56% > > > 526.blender_r 2.97% > > > 527.cam4_r 2.36% > > > 538.imagick_r 46.40% > > > 544.nab_r 2.04% > > > 549.fotonik3d_r 0.00% > > > 554.roms_r 1.27% > > > geomean 5.49% > > > > > > On Skylake server: > > > > > > 500.perlbench_r 0.71% > > > 502.gcc_r -0.51% > > > 505.mcf_r -1.06% > > > 520.omnetpp_r -0.33% > > > 523.xalancbmk_r -0.22% > > > 525.x264_r 1.72% > > > 531.deepsjeng_r -0.26% > > > 541.leela_r 0.57% > > > 548.exchange2_r -0.75% > > > 557.xz_r -1.28% > > > geomean -0.21% > > > > > > 503.bwaves_r 0.00% > > > 507.cactuBSSN_r 2.66% > > > 508.namd_r 3.67% > > > 510.parest_r 1.25% > > > 511.povray_r 2.26% > > > 519.lbm_r 1.69% > > > 521.wrf_r 11.03% > > > 526.blender_r 3.39% > > > 527.cam4_r 1.69% > > > 538.imagick_r 64.59% > > > 544.nab_r -0.54% > > > 549.fotonik3d_r 2.68% > > > 554.roms_r 0.00% > > > geomean 6.19% > > > > > > This patch improves -march=native performance on Skylake up to 60% and > > > leaves -march=native performance unchanged on Haswell. > > > > > > OK for trunk? > > > > > > Thanks. > > > > > > H.J. > > > --- > > > gcc/ > > > > > > 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> > > > Sunil K Pandey <sunil.k.pandey@intel.com> > > > > > > PR target/84413 > > > * config/i386/i386.c (m_HASWELL): Add PROCESSOR_SKYLAKE, > > > PROCESSOR_SKYLAKE_AVX512, PROCESSOR_CANNONLAKE, > > > PROCESSOR_ICELAKE_CLIENT and PROCESSOR_ICELAKE_SERVER. > > > (m_SKYLAKE): Set to 0. > > > (m_SKYLAKE_AVX512): Likewise. > > > (m_CANNONLAKE): Likewise. > > > (m_ICELAKE_CLIENT): Likewise. > > > (m_ICELAKE_SERVER): Likewise. > > > * config/i386/x86-tune.def (avx256_optimal): Also enabled for > > > m_HASWELL. 
> > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > > index 9e46b7b136f..762ab89fc9e 100644 > > > --- a/gcc/config/i386/i386.c > > > +++ b/gcc/config/i386/i386.c > > > @@ -137,17 +137,22 @@ const struct processor_costs *ix86_cost = NULL; > > > #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2) > > > #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) > > > #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) > > > -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) > > > +#define m_HASWELL ((HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) \ > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) \ > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) \ > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) \ > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) \ > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)) > > > > > > > Please introduce a new per-family define and group processors in this > > define. Something like m_BDVER, m_BTVER and m_AMD_MULTIPLE for AMD > targets. > > We should not redefine m_HASWELL to include unrelated families. > > > > Here is the updated patch. OK for trunk if all tests pass? > > OK. Thanks, Uros. Thanks. > > H.J. > ---- > r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations > which are enabled by PROCESSOR_HASWELL. As the result, -mtune=skylake > generates slower codes on Skylake than before. The same also applies > to Cannonlake and Icelak tuning. > > This patch changes -mtune={skylake|cannonlake|icelake} to tune like > -mtune=haswell for until their tuning is properly adjusted. It also > enables -mprefer-vector-width=256 for -mtune=haswell, which has no > impact on codegen when AVX512 isn't enabled. > > Performance impacts on SPEC CPU 2017 rate with 1 copy using > > -march=native -mfpmath=sse -O2 -m64 > > are > > 1. 
On Broadwell server: > > 500.perlbench_r -0.56% > 502.gcc_r -0.18% > 505.mcf_r 0.24% > 520.omnetpp_r 0.00% > 523.xalancbmk_r -0.32% > 525.x264_r -0.17% > 531.deepsjeng_r 0.00% > 541.leela_r 0.00% > 548.exchange2_r 0.12% > 557.xz_r 0.00% > Geomean 0.00% > > 503.bwaves_r 0.00% > 507.cactuBSSN_r 0.21% > 508.namd_r 0.00% > 510.parest_r 0.19% > 511.povray_r -0.48% > 519.lbm_r 0.00% > 521.wrf_r 0.28% > 526.blender_r 0.19% > 527.cam4_r 0.39% > 538.imagick_r 0.00% > 544.nab_r -0.36% > 549.fotonik3d_r 0.51% > 554.roms_r 0.00% > Geomean 0.17% > > On Skylake client: > > 500.perlbench_r 0.96% > 502.gcc_r 0.13% > 505.mcf_r -1.03% > 520.omnetpp_r -1.11% > 523.xalancbmk_r 1.02% > 525.x264_r 0.50% > 531.deepsjeng_r 2.97% > 541.leela_r 0.50% > 548.exchange2_r -0.95% > 557.xz_r 2.41% > Geomean 0.56% > > 503.bwaves_r 0.49% > 507.cactuBSSN_r 3.17% > 508.namd_r 4.05% > 510.parest_r 0.15% > 511.povray_r 0.80% > 519.lbm_r 3.15% > 521.wrf_r 10.56% > 526.blender_r 2.97% > 527.cam4_r 2.36% > 538.imagick_r 46.40% > 544.nab_r 2.04% > 549.fotonik3d_r 0.00% > 554.roms_r 1.27% > Geomean 5.49% > > On Skylake server: > > 500.perlbench_r 0.71% > 502.gcc_r -0.51% > 505.mcf_r -1.06% > 520.omnetpp_r -0.33% > 523.xalancbmk_r -0.22% > 525.x264_r 1.72% > 531.deepsjeng_r -0.26% > 541.leela_r 0.57% > 548.exchange2_r -0.75% > 557.xz_r -1.28% > Geomean -0.21% > > 503.bwaves_r 0.00% > 507.cactuBSSN_r 2.66% > 508.namd_r 3.67% > 510.parest_r 1.25% > 511.povray_r 2.26% > 519.lbm_r 1.69% > 521.wrf_r 11.03% > 526.blender_r 3.39% > 527.cam4_r 1.69% > 538.imagick_r 64.59% > 544.nab_r -0.54% > 549.fotonik3d_r 2.68% > 554.roms_r 0.00% > Geomean 6.19% > > This patch improves -march=native performance on Skylake up to 60% and > leaves -march=native performance unchanged on Haswell. > > gcc/ > > 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> > Sunil K Pandey <sunil.k.pandey@intel.com> > > PR target/84413 > * config/i386/i386.c (m_CORE_AVX512): New. > (m_CORE_AVX2): Likewise. > (m_CORE_ALL): Add m_CORE_AVX2. 
> * config/i386/x86-tune.def: Replace m_HASWELL with m_CORE_AVX2. > Replace m_SKYLAKE_AVX512 with m_CORE_AVX512 on avx256_optimal > and remove the rest of m_SKYLAKE_AVX512. > > gcc/testsuite/ > > 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> > Sunil K Pandey <sunil.k.pandey@intel.com> > > PR target/84413 > * gcc.target/i386/pr84413-1.c: New test. > * gcc.target/i386/pr84413-2.c: Likewise. > * gcc.target/i386/pr84413-3.c: Likewise. > * gcc.target/i386/pr84413-4.c: Likewise. > > --- > gcc/config/i386/i386.c | 5 ++++- > gcc/config/i386/x86-tune.def | 26 +++++++++++------------ > gcc/testsuite/gcc.target/i386/pr84413-1.c | 17 +++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-2.c | 17 +++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-3.c | 17 +++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-4.c | 17 +++++++++++++++ > 6 files changed, 85 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-4.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 9e46b7b136f..ccc24e375ad 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -138,7 +138,6 @@ const struct processor_costs *ix86_cost = NULL; > #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) > #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) > #define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) > -#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) > #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL) > #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT) > #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL) > @@ -148,6 +147,10 @@ const struct processor_costs *ix86_cost = NULL; > #define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) > #define m_ICELAKE_CLIENT 
(HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) > #define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER) > +#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \ > + | m_ICELAKE_CLIENT | m_ICELAKE_SERVER) > +#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512) > +#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2) > #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT) > #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS) > #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index 8a8d5ab2440..a46450ad99d 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -49,9 +49,9 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", > over partial stores. For example preffer MOVZBL or MOVQ to load 8bit > value over movb. */ > DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", > - m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL > + m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 > | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | > m_INTEL > - | m_KNL | m_KNM | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 | m_TREMONT > + | m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT > | m_GENERIC) > > /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store > @@ -87,8 +87,8 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, > "partial_flag_reg_stall", > DEF_TUNE (X86_TUNE_MOVX, "movx", > m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE > | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL > - | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 > - | m_HASWELL | m_TREMONT | m_GENERIC) > + | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE > + | m_CORE_AVX2 | m_TREMONT | m_GENERIC) > > /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are > followed by > full sized loads. 
*/ > @@ -105,19 +105,19 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, > "fuse_cmp_and_branch_32", > /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent > conditional jump instruction for TARGET_64BIT. */ > DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64", > - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | > m_GENERIC) > + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | > m_GENERIC) > > /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a > subsequent conditional jump instruction when the condition jump > check sign flag (SF) or overflow flag (OF). */ > DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, > "fuse_cmp_and_branch_soflags", > - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | > m_GENERIC) > + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | > m_GENERIC) > > /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional > jump instruction when the alu instruction produces the CCFLAG consumed > by > the conditional jump instruction. */ > DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", > - m_SANDYBRIDGE | m_HASWELL | m_GENERIC) > + m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) > > > /*********************************************************** > ******************/ > @@ -297,7 +297,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", > /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency > for bit-manipulation instructions. */ > DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", > - m_SANDYBRIDGE | m_HASWELL | m_GENERIC) > + m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) > > /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based > on hardware capabilities. 
Bdver3 hardware has a loop buffer which makes > @@ -349,15 +349,15 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, > "general_regs_sse_spill", > /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads > instead > of a sequence loading registers by parts. */ > DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, > "sse_unaligned_load_optimal", > - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | > m_KNM > - | m_INTEL | m_SKYLAKE_AVX512 | m_GOLDMONT | m_GOLDMONT_PLUS > + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | > m_KNM > + | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS > | m_TREMONT | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER1 | > m_GENERIC) > > /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned > stores instead > of a sequence loading registers by parts. */ > DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, > "sse_unaligned_store_optimal", > - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | > m_KNM > - | m_INTEL | m_SKYLAKE_AVX512 | m_GOLDMONT | m_GOLDMONT_PLUS > + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | > m_KNM > + | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS > | m_TREMONT | m_BDVER | m_ZNVER1 | m_GENERIC) > > /* Use packed single precision instructions where posisble. I.e. movups > instead > @@ -446,7 +446,7 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", > m_BDVER | m_BTVER2 > > /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of > 512-bit AVX > instructions in the auto-vectorizer. 
*/ > -DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512) > +DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512) > > /*********************************************************** > ******************/ > /* Historical relics: tuning flags that helps a specific old CPU designs > */ > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-1.c > b/gcc/testsuite/gcc.target/i386/pr84413-1.c > new file mode 100644 > index 00000000000..1c94d7715cf > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-1.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=skylake-avx512" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } > */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-2.c > b/gcc/testsuite/gcc.target/i386/pr84413-2.c > new file mode 100644 > index 00000000000..adf9b527cd6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=cannonlake" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } > */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-3.c > b/gcc/testsuite/gcc.target/i386/pr84413-3.c > new file mode 100644 > index 00000000000..76bf25fc56b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-3.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=icelake-server" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ 
\\t\]+\[^\n\]*%ymm\[0-9\]+" } } > */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-4.c > b/gcc/testsuite/gcc.target/i386/pr84413-4.c > new file mode 100644 > index 00000000000..031ef0c8916 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-4.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=haswell -mavx512f" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } > */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > -- > 2.17.1 > >
> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > > > index 9e46b7b136f..762ab89fc9e 100644 > > > > --- a/gcc/config/i386/i386.c > > > > +++ b/gcc/config/i386/i386.c > > > > @@ -137,17 +137,22 @@ const struct processor_costs *ix86_cost = NULL; > > > > #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2) > > > > #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) > > > > #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) > > > > -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) > > > > +#define m_HASWELL ((HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) \ > > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) \ > > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) \ > > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) \ > > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) \ > > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)) > > > > > > > > > > Please introduce a new per-family define and group processors in this > > > define. Something like m_BDVER, m_BTVER and m_AMD_MULTIPLE for AMD > > targets. > > > We should not redefine m_HASWELL to include unrelated families. > > > > > > > Here is the updated patch. OK for trunk if all tests pass? > > > > > OK. We have also noticed that benchmarks on skylake are not good compared to haswell, this nicely explains it. I think this is -march=native regression compared to GCC versions that did not suppored better CPUs than Haswell. So it would be nice to backport it. Honza
On Fri, Jul 13, 2018 at 9:07 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c >> > > > index 9e46b7b136f..762ab89fc9e 100644 >> > > > --- a/gcc/config/i386/i386.c >> > > > +++ b/gcc/config/i386/i386.c >> > > > @@ -137,17 +137,22 @@ const struct processor_costs *ix86_cost = NULL; >> > > > #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2) >> > > > #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) >> > > > #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) >> > > > -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) >> > > > +#define m_HASWELL ((HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) \ >> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) \ >> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) \ >> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) \ >> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) \ >> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)) >> > > > >> > > >> > > Please introduce a new per-family define and group processors in this >> > > define. Something like m_BDVER, m_BTVER and m_AMD_MULTIPLE for AMD >> > targets. >> > > We should not redefine m_HASWELL to include unrelated families. >> > > >> > >> > Here is the updated patch. OK for trunk if all tests pass? >> > >> > >> OK. > > We have also noticed that benchmarks on skylake are not good compared to > haswell, this nicely explains it. I think this is -march=native regression > compared to GCC versions that did not suppored better CPUs than Haswell. So it > would be nice to backport it. Yes, we should. Here is the patch to backport to GCC 8. OK for GCC 8 after it has been checked into trunk? Thanks.
> > We have also noticed that benchmarks on skylake are not good compared to > > haswell, this nicely explains it. I think this is -march=native regression > > compared to GCC versions that did not suppored better CPUs than Haswell. So it > > would be nice to backport it. > > Yes, we should. Here is the patch to backport to GCC 8. OK for GCC 8 after > it has been checked into trunk? OK, Honza > > Thanks. > > -- > H.J. > From 40a1050b330b421a1f445cb2a40b5a002da2e6d6 Mon Sep 17 00:00:00 2001 > From: "H.J. Lu" <hjl.tools@gmail.com> > Date: Mon, 4 Jun 2018 19:16:06 -0700 > Subject: [PATCH] x86: Tune Skylake, Cannonlake and Icelake as Haswell > > r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations > which are enabled by PROCESSOR_HASWELL. As the result, -mtune=skylake > generates slower codes on Skylake than before. The same also applies > to Cannonlake and Icelak tuning. > > This patch changes -mtune={skylake|cannonlake|icelake} to tune like > -mtune=haswell for until their tuning is properly adjusted. It also > enables -mprefer-vector-width=256 for -mtune=haswell, which has no > impact on codegen when AVX512 isn't enabled. > > Performance impacts on SPEC CPU 2017 rate with 1 copy using > > -march=native -mfpmath=sse -O2 -m64 > > are > > 1. 
On Broadwell server: > > 500.perlbench_r -0.56% > 502.gcc_r -0.18% > 505.mcf_r 0.24% > 520.omnetpp_r 0.00% > 523.xalancbmk_r -0.32% > 525.x264_r -0.17% > 531.deepsjeng_r 0.00% > 541.leela_r 0.00% > 548.exchange2_r 0.12% > 557.xz_r 0.00% > Geomean 0.00% > > 503.bwaves_r 0.00% > 507.cactuBSSN_r 0.21% > 508.namd_r 0.00% > 510.parest_r 0.19% > 511.povray_r -0.48% > 519.lbm_r 0.00% > 521.wrf_r 0.28% > 526.blender_r 0.19% > 527.cam4_r 0.39% > 538.imagick_r 0.00% > 544.nab_r -0.36% > 549.fotonik3d_r 0.51% > 554.roms_r 0.00% > Geomean 0.17% > > On Skylake client: > > 500.perlbench_r 0.96% > 502.gcc_r 0.13% > 505.mcf_r -1.03% > 520.omnetpp_r -1.11% > 523.xalancbmk_r 1.02% > 525.x264_r 0.50% > 531.deepsjeng_r 2.97% > 541.leela_r 0.50% > 548.exchange2_r -0.95% > 557.xz_r 2.41% > Geomean 0.56% > > 503.bwaves_r 0.49% > 507.cactuBSSN_r 3.17% > 508.namd_r 4.05% > 510.parest_r 0.15% > 511.povray_r 0.80% > 519.lbm_r 3.15% > 521.wrf_r 10.56% > 526.blender_r 2.97% > 527.cam4_r 2.36% > 538.imagick_r 46.40% > 544.nab_r 2.04% > 549.fotonik3d_r 0.00% > 554.roms_r 1.27% > Geomean 5.49% > > On Skylake server: > > 500.perlbench_r 0.71% > 502.gcc_r -0.51% > 505.mcf_r -1.06% > 520.omnetpp_r -0.33% > 523.xalancbmk_r -0.22% > 525.x264_r 1.72% > 531.deepsjeng_r -0.26% > 541.leela_r 0.57% > 548.exchange2_r -0.75% > 557.xz_r -1.28% > Geomean -0.21% > > 503.bwaves_r 0.00% > 507.cactuBSSN_r 2.66% > 508.namd_r 3.67% > 510.parest_r 1.25% > 511.povray_r 2.26% > 519.lbm_r 1.69% > 521.wrf_r 11.03% > 526.blender_r 3.39% > 527.cam4_r 1.69% > 538.imagick_r 64.59% > 544.nab_r -0.54% > 549.fotonik3d_r 2.68% > 554.roms_r 0.00% > Geomean 6.19% > > This patch improves -march=native performance on Skylake up to 60% and > leaves -march=native performance unchanged on Haswell. > > gcc/ > > Backport from mainline > 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> > Sunil K Pandey <sunil.k.pandey@intel.com> > > PR target/84413 > * config/i386/i386.c (m_CORE_AVX512): New. > (m_CORE_AVX2): Likewise. 
> (m_CORE_ALL): Add m_CORE_AVX2. > * config/i386/x86-tune.def: Replace m_HASWELL with m_CORE_AVX2. > Replace m_SKYLAKE_AVX512 with m_CORE_AVX512 on avx256_optimal > and remove the rest of m_SKYLAKE_AVX512. > > gcc/testsuite/ > > Backport from mainline > 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> > Sunil K Pandey <sunil.k.pandey@intel.com> > > PR target/84413 > * gcc.target/i386/pr84413-1.c: New test. > * gcc.target/i386/pr84413-2.c: Likewise. > * gcc.target/i386/pr84413-3.c: Likewise. > * gcc.target/i386/pr84413-4.c: Likewise. > --- > gcc/config/i386/i386.c | 5 ++++- > gcc/config/i386/x86-tune.def | 26 +++++++++++------------ > gcc/testsuite/gcc.target/i386/pr84413-1.c | 17 +++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-2.c | 17 +++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-3.c | 17 +++++++++++++++ > gcc/testsuite/gcc.target/i386/pr84413-4.c | 17 +++++++++++++++ > 6 files changed, 85 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-4.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index d7dad81786a..8a032371e7f 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -139,7 +139,6 @@ const struct processor_costs *ix86_cost = NULL; > #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) > #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) > #define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) > -#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) > #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL) > #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT) > #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL) > @@ -149,6 +148,10 @@ const struct processor_costs *ix86_cost = NULL; > #define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) > 
#define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) > #define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER) > +#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \ > + | m_ICELAKE_CLIENT | m_ICELAKE_SERVER) > +#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512) > +#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2) > #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL) > > #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE) > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index 60625668236..c99e45cba58 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -48,9 +48,9 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", > over partial stores. For example preffer MOVZBL or MOVQ to load 8bit > value over movb. */ > DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", > - m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL > + m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 > | m_BONNELL | m_SILVERMONT | m_INTEL > - | m_KNL | m_KNM | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 | m_GENERIC) > + | m_KNL | m_KNM | m_AMD_MULTIPLE | m_GENERIC) > > /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store > destinations to be 128bit to allow register renaming on 128bit SSE units, > @@ -84,8 +84,8 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", > partial dependencies. */ > DEF_TUNE (X86_TUNE_MOVX, "movx", > m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE > - | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_HASWELL > - | m_GEODE | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 | m_GENERIC) > + | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_AVX2 > + | m_GEODE | m_AMD_MULTIPLE | m_GENERIC) > > /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by > full sized loads. 
*/ > @@ -101,19 +101,19 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32", > /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent > conditional jump instruction for TARGET_64BIT. */ > DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64", > - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC) > + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC) > > /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a > subsequent conditional jump instruction when the condition jump > check sign flag (SF) or overflow flag (OF). */ > DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", > - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC) > + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC) > > /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional > jump instruction when the alu instruction produces the CCFLAG consumed by > the conditional jump instruction. */ > DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", > - m_SANDYBRIDGE | m_HASWELL | m_GENERIC) > + m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) > > > /*****************************************************************************/ > @@ -286,7 +286,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", > /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency > for bit-manipulation instructions. */ > DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", > - m_SANDYBRIDGE | m_HASWELL | m_GENERIC) > + m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) > > /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based > on hardware capabilities. Bdver3 hardware has a loop buffer which makes > @@ -335,15 +335,15 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill", > /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead > of a sequence loading registers by parts. 
*/ > DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal", > - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM > - | m_INTEL | m_SKYLAKE_AVX512 | m_AMDFAM10 | m_BDVER | m_BTVER > + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM > + | m_INTEL | m_AMDFAM10 | m_BDVER | m_BTVER > | m_ZNVER1 | m_GENERIC) > > /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead > of a sequence loading registers by parts. */ > DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal", > - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM > - | m_INTEL | m_SKYLAKE_AVX512 | m_BDVER | m_ZNVER1 | m_GENERIC) > + m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM > + | m_INTEL | m_BDVER | m_ZNVER1 | m_GENERIC) > > /* Use packed single precision instructions where posisble. I.e. movups instead > of movupd. */ > @@ -429,7 +429,7 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 > > /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX > instructions in the auto-vectorizer. 
*/ > -DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512) > +DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512) > > /*****************************************************************************/ > /* Historical relics: tuning flags that helps a specific old CPU designs */ > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-1.c b/gcc/testsuite/gcc.target/i386/pr84413-1.c > new file mode 100644 > index 00000000000..1c94d7715cf > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-1.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=skylake-avx512" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-2.c b/gcc/testsuite/gcc.target/i386/pr84413-2.c > new file mode 100644 > index 00000000000..adf9b527cd6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=cannonlake" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-3.c b/gcc/testsuite/gcc.target/i386/pr84413-3.c > new file mode 100644 > index 00000000000..76bf25fc56b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-3.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=icelake-server" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ 
\\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr84413-4.c b/gcc/testsuite/gcc.target/i386/pr84413-4.c > new file mode 100644 > index 00000000000..031ef0c8916 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr84413-4.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=haswell -mavx512f" } */ > +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ > + > +#define N 1024 > + > +double a[N], b[N], c[N]; > + > +void > +avx512f_test (void) > +{ > + int i; > + > + for (i = 0; i < N; i++) > + c[i] = a[i] * b[i]; > +} > -- > 2.17.1 >
On Fri, Jul 13, 2018 at 9:31 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> > We have also noticed that benchmarks on skylake are not good compared to >> > haswell, this nicely explains it. I think this is -march=native regression >> > compared to GCC versions that did not suppored better CPUs than Haswell. So it >> > would be nice to backport it. >> >> Yes, we should. Here is the patch to backport to GCC 8. OK for GCC 8 after >> it has been checked into trunk? > > OK, > Honza >> >> Thanks. >> >> -- >> H.J. > >> From 40a1050b330b421a1f445cb2a40b5a002da2e6d6 Mon Sep 17 00:00:00 2001 >> From: "H.J. Lu" <hjl.tools@gmail.com> >> Date: Mon, 4 Jun 2018 19:16:06 -0700 >> Subject: [PATCH] x86: Tune Skylake, Cannonlake and Icelake as Haswell >> >> r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations >> which are enabled by PROCESSOR_HASWELL. As the result, -mtune=skylake >> generates slower codes on Skylake than before. The same also applies >> to Cannonlake and Icelak tuning. >> >> This patch changes -mtune={skylake|cannonlake|icelake} to tune like >> -mtune=haswell for until their tuning is properly adjusted. It also >> enables -mprefer-vector-width=256 for -mtune=haswell, which has no >> impact on codegen when AVX512 isn't enabled. >> >> Performance impacts on SPEC CPU 2017 rate with 1 copy using >> >> -march=native -mfpmath=sse -O2 -m64 >> >> are >> >> 1. 
On Broadwell server: >> >> 500.perlbench_r -0.56% >> 502.gcc_r -0.18% >> 505.mcf_r 0.24% >> 520.omnetpp_r 0.00% >> 523.xalancbmk_r -0.32% >> 525.x264_r -0.17% >> 531.deepsjeng_r 0.00% >> 541.leela_r 0.00% >> 548.exchange2_r 0.12% >> 557.xz_r 0.00% >> Geomean 0.00% >> >> 503.bwaves_r 0.00% >> 507.cactuBSSN_r 0.21% >> 508.namd_r 0.00% >> 510.parest_r 0.19% >> 511.povray_r -0.48% >> 519.lbm_r 0.00% >> 521.wrf_r 0.28% >> 526.blender_r 0.19% >> 527.cam4_r 0.39% >> 538.imagick_r 0.00% >> 544.nab_r -0.36% >> 549.fotonik3d_r 0.51% >> 554.roms_r 0.00% >> Geomean 0.17% >> >> On Skylake client: >> >> 500.perlbench_r 0.96% >> 502.gcc_r 0.13% >> 505.mcf_r -1.03% >> 520.omnetpp_r -1.11% >> 523.xalancbmk_r 1.02% >> 525.x264_r 0.50% >> 531.deepsjeng_r 2.97% >> 541.leela_r 0.50% >> 548.exchange2_r -0.95% >> 557.xz_r 2.41% >> Geomean 0.56% >> >> 503.bwaves_r 0.49% >> 507.cactuBSSN_r 3.17% >> 508.namd_r 4.05% >> 510.parest_r 0.15% >> 511.povray_r 0.80% >> 519.lbm_r 3.15% >> 521.wrf_r 10.56% >> 526.blender_r 2.97% >> 527.cam4_r 2.36% >> 538.imagick_r 46.40% >> 544.nab_r 2.04% >> 549.fotonik3d_r 0.00% >> 554.roms_r 1.27% >> Geomean 5.49% >> >> On Skylake server: >> >> 500.perlbench_r 0.71% >> 502.gcc_r -0.51% >> 505.mcf_r -1.06% >> 520.omnetpp_r -0.33% >> 523.xalancbmk_r -0.22% >> 525.x264_r 1.72% >> 531.deepsjeng_r -0.26% >> 541.leela_r 0.57% >> 548.exchange2_r -0.75% >> 557.xz_r -1.28% >> Geomean -0.21% >> >> 503.bwaves_r 0.00% >> 507.cactuBSSN_r 2.66% >> 508.namd_r 3.67% >> 510.parest_r 1.25% >> 511.povray_r 2.26% >> 519.lbm_r 1.69% >> 521.wrf_r 11.03% >> 526.blender_r 3.39% >> 527.cam4_r 1.69% >> 538.imagick_r 64.59% >> 544.nab_r -0.54% >> 549.fotonik3d_r 2.68% >> 554.roms_r 0.00% >> Geomean 6.19% >> >> This patch improves -march=native performance on Skylake up to 60% and >> leaves -march=native performance unchanged on Haswell. >> >> gcc/ >> >> Backport from mainline >> 2018-07-12 H.J. 
Lu <hongjiu.lu@intel.com> >> Sunil K Pandey <sunil.k.pandey@intel.com> >> >> PR target/84413 >> * config/i386/i386.c (m_CORE_AVX512): New. >> (m_CORE_AVX2): Likewise. >> (m_CORE_ALL): Add m_CORE_AVX2. >> * config/i386/x86-tune.def: Replace m_HASWELL with m_CORE_AVX2. >> Replace m_SKYLAKE_AVX512 with m_CORE_AVX512 on avx256_optimal >> and remove the rest of m_SKYLAKE_AVX512. >> >> gcc/testsuite/ >> >> Backport from mainline >> 2018-07-12 H.J. Lu <hongjiu.lu@intel.com> >> Sunil K Pandey <sunil.k.pandey@intel.com> >> >> PR target/84413 >> * gcc.target/i386/pr84413-1.c: New test. >> * gcc.target/i386/pr84413-2.c: Likewise. >> * gcc.target/i386/pr84413-3.c: Likewise. >> * gcc.target/i386/pr84413-4.c: Likewise. This is the patch I checked into trunk. I dropped gcc.target/i386/pr84413-4.c: /* { dg-do compile } */ /* { dg-options "-O3 -march=haswell -mavx512f" } */ /* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ /* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ #define N 1024 double a[N], b[N], c[N]; void avx512f_test (void) { int i; for (i = 0; i < N; i++) c[i] = a[i] * b[i]; } since -mtune=haswell no longer enables avx256_optimal in my first patch. I will do the same for GCC8 backport. Thanks.
On Fri, 13 Jul 2018, H.J. Lu wrote:
> I will do the same for GCC8 backport.
Can you please add a note to gcc-8/changes.html? This seems big
enough to warrant a note in the section for GCC 8.2.
(At gcc-7/changes.html you can see how to go about this for minor
releases.)
Gerald
On Sat, Jul 14, 2018 at 06:09:47PM +0200, Gerald Pfeifer wrote: > On Fri, 13 Jul 2018, H.J. Lu wrote: > > I will do the same for GCC8 backport. > > Can you please add a note to gcc-8/changes.html? This seems big > enough to warrant a note in a part for GCC 8.2. > > (At gcc-7/changes.html you can see how to go about this for minor > releases.) > Like this? H.J. --- Index: changes.html =================================================================== RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-8/changes.html,v retrieving revision 1.88 diff -u -p -r1.88 changes.html --- changes.html 14 Jun 2018 13:52:35 -0000 1.88 +++ changes.html 14 Jul 2018 21:17:10 -0000 @@ -1312,5 +1312,23 @@ known to be fixed in the 8.1 release. Th complete (that is, it is possible that some PRs that have been fixed are not listed here).</p> +<!-- .................................................................. --> +<h2 id="GCC8.2">GCC 8.2</h2> + +<p>This is the <a href="https://gcc.gnu.org/bugzilla/buglist.cgi?bug_status=RESOLVED&resolution=FIXED&target_milestone=8.2">list +of problem reports (PRs)</a> from GCC's bug tracking system that are +known to be fixed in the 8.1 release. This list might not be +complete (that is, it is possible that some PRs that have been fixed +are not listed here).</p> + +<h3>Target Specific Changes</h3> + +<h4>IA-32/x86-64</h4> + <ul> + <li> <code>-mtune=native</code> performance regression + <a href="https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84413">PR84413</a> + on Intel Skylake processors has been fixed.</li> + </ul> + </body> </html>
On Sat, 14 Jul 2018, H.J. Lu wrote:
> Like this?
Yes, this looks fine.
Thanks!
Gerald
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 9e46b7b136f..762ab89fc9e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -137,17 +137,22 @@ const struct processor_costs *ix86_cost = NULL; #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2) #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) +#define m_HASWELL ((HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)) #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL) #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT) #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL) #define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM) -#define m_SKYLAKE (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) -#define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) -#define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) -#define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) -#define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER) +#define m_SKYLAKE 0 +#define m_SKYLAKE_AVX512 0 +#define m_CANNONLAKE 0 +#define m_ICELAKE_CLIENT 0 +#define m_ICELAKE_SERVER 0 #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT) #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS) #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 8a8d5ab2440..c8abaedad8c 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -444,9 +444,12 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal" DEF_TUNE 
(X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 | m_ZNVER1) -/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX - instructions in the auto-vectorizer. */ -DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512) +/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit + AVX instructions in the auto-vectorizer. NB: This is also enabled for + -mtune=haswell so that we can tune Skylake, Cannonlake and Icelake as + Haswell. */ +DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512 + | m_HASWELL) /*****************************************************************************/ /* Historical relics: tuning flags that helps a specific old CPU designs */ diff --git a/gcc/testsuite/gcc.target/i386/pr84413-1.c b/gcc/testsuite/gcc.target/i386/pr84413-1.c new file mode 100644 index 00000000000..1c94d7715cf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=skylake-avx512" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr84413-2.c b/gcc/testsuite/gcc.target/i386/pr84413-2.c new file mode 100644 index 00000000000..adf9b527cd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=cannonlake" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr84413-3.c 
b/gcc/testsuite/gcc.target/i386/pr84413-3.c new file mode 100644 index 00000000000..76bf25fc56b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-3.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=icelake-server" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr84413-4.c b/gcc/testsuite/gcc.target/i386/pr84413-4.c new file mode 100644 index 00000000000..031ef0c8916 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-4.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=haswell -mavx512f" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +}