Message ID | 20180209073200.29318-1-siddhesh@sourceware.org |
---|---|
State | New |
Headers | show |
Series | [v3] Disable reg offset in quad-word store for Falkor | expand |
Ping! On Friday 09 February 2018 01:02 PM, Siddhesh Poyarekar wrote: > Hi, > > Here's v3 of the patch to disable register offset addressing mode for > stores of 128-bit values on Falkor because they're very costly. > Following Kyrill's suggestion, I compared the codegen for a53 and > found that the codegen was quite different. Jim's original patch is > the most minimal compromise for this and is also a cleaner temporary > fix before I attempt to split address costs into loads and stores for > gcc9. > > So v3 is essentially a very slightly cleaned up version of v1 again, > this time with confirmation that there are no codegen changes in > CPU2017 on non-falkor builds; only the codegen for -mcpu=falkor is > different. > > ---- > > On Falkor, because of an idiosyncracy of how the pipelines are > designed, a quad-word store using a reg+reg addressing mode is almost > twice as slow as an add followed by a quad-word store with a single > reg addressing mode. So we get better performance if we disallow > addressing modes using register offsets with quad-word stores. This > is the most minimal change for gcc8, I will volunteer to make a more > lasting change for gcc9 where I split the addressing mode costs into > loads and stores wherever possible and needed. > > This patch improves fpspeed by 0.17% and intspeed by 0.62% in CPU2017, > with xalancbmk_s (3.84%) wrf_s (1.46%) and mcf_s (1.62%) being the > biggest winners. There were no regressions beyond 0.4%. > > 2018-xx-xx Jim Wilson <jim.wilson@linaro.org> > Kugan Vivenakandarajah <kugan.vivekanandarajah@linaro.org> > Siddhesh Poyarekar <siddhesh@sourceware.org> > > gcc/ > * config/aarch64/aarch64-protos.h (aarch64_movti_target_operand_p): > New. > * config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>): Use Utf. > * config/aarch64/aarch64-tuning-flags.def > (SLOW_REGOFFSET_QUADWORD_STORE): New. > * config/aarch64/aarch64.c (qdf24xx_tunings): Add > SLOW_REGOFFSET_QUADWORD_STORE to tuning flags. > (aarch64_movti_target_operand_p): New. > * config/aarch64/aarch64.md (movti_aarch64): Use Utf. > (movtf_aarch64): Likewise. > * config/aarch64/constraints.md (Utf): New. > > gcc/testsuite > * gcc.target/aarch64/pr82533.c: New test case. > --- > gcc/config/aarch64/aarch64-protos.h | 1 + > gcc/config/aarch64/aarch64-simd.md | 4 ++-- > gcc/config/aarch64/aarch64-tuning-flags.def | 4 ++++ > gcc/config/aarch64/aarch64.c | 14 +++++++++++++- > gcc/config/aarch64/aarch64.md | 8 ++++---- > gcc/config/aarch64/constraints.md | 6 ++++++ > gcc/testsuite/gcc.target/aarch64/pr82533.c | 11 +++++++++++ > 7 files changed, 41 insertions(+), 7 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/pr82533.c > > diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h > index cda2895d28e..5a0323deb1e 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -433,6 +433,7 @@ bool aarch64_simd_mem_operand_p (rtx); > bool aarch64_sve_ld1r_operand_p (rtx); > bool aarch64_sve_ldr_operand_p (rtx); > bool aarch64_sve_struct_memory_operand_p (rtx); > +bool aarch64_movti_target_operand_p (rtx); > rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool); > rtx aarch64_tls_get_addr (void); > tree aarch64_fold_builtin (tree, int, tree *, bool); > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index 3d1f6a01cb7..f7daac3e28d 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -131,9 +131,9 @@ > > (define_insn "*aarch64_simd_mov<VQ:mode>" > [(set (match_operand:VQ 0 "nonimmediate_operand" > - "=w, Umq, m, w, ?r, ?w, ?r, w") > + "=w, Umq, Utf, w, ?r, ?w, ?r, w") > (match_operand:VQ 1 "general_operand" > - "m, Dz, w, w, w, r, r, Dn"))] > + "m, Dz, w, w, w, r, r, Dn"))] > "TARGET_SIMD > && (register_operand (operands[0], <MODE>mode) > || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))" > diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def > index ea9ead234cb..04baf5b6de6 100644 > --- a/gcc/config/aarch64/aarch64-tuning-flags.def > +++ b/gcc/config/aarch64/aarch64-tuning-flags.def > @@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW) > are not considered cheap. */ > AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) > > +/* Don't use a register offset in a memory address for a quad-word store. */ > +AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store", > + SLOW_REGOFFSET_QUADWORD_STORE) > + > #undef AARCH64_EXTRA_TUNING_OPTION > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index 228fd1b908d..c0a05598415 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -875,7 +875,7 @@ static const struct tune_params qdf24xx_tunings = > 2, /* min_div_recip_mul_df. */ > 0, /* max_case_values. */ > tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ > - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ > + (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE), /* tune_flags. */ > &qdf24xx_prefetch_tune > }; > > @@ -13634,6 +13634,18 @@ aarch64_sve_struct_memory_operand_p (rtx op) > && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last)); > } > > +/* Return TRUE if OP uses an efficient memory address for quad-word target. */ > +bool > +aarch64_movti_target_operand_p (rtx op) > +{ > + if (! optimize_size > + && (aarch64_tune_params.extra_tuning_flags > + & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE)) > + return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS > + && ! CONST_INT_P (XEXP (XEXP (op, 0), 1))); > + return MEM_P (op); > +} > + > /* Emit a register copy from operand to operand, taking care not to > early-clobber source registers in the process. > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index 5a2a9309a3b..1e6edcf51f2 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -1080,9 +1080,9 @@ > > (define_insn "*movti_aarch64" > [(set (match_operand:TI 0 > - "nonimmediate_operand" "= r,w, r,w,r,m,m,w,m") > + "nonimmediate_operand" "= r,w, r,w,r,m,m,w,Utf") > (match_operand:TI 1 > - "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m,w"))] > + "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m, w"))] > "(register_operand (operands[0], TImode) > || aarch64_reg_or_zero (operands[1], TImode))" > "@ > @@ -1227,9 +1227,9 @@ > > (define_insn "*movtf_aarch64" > [(set (match_operand:TF 0 > - "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m") > + "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Utf,?r,m ,m") > (match_operand:TF 1 > - "general_operand" " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))] > + "general_operand" " w,?r, ?r,w ,Y,Y ,m,w ,m ,?r,Y"))] > "TARGET_FLOAT && (register_operand (operands[0], TFmode) > || aarch64_reg_or_fp_zero (operands[1], TFmode))" > "@ > diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md > index f052103e859..35aa62996ae 100644 > --- a/gcc/config/aarch64/constraints.md > +++ b/gcc/config/aarch64/constraints.md > @@ -235,6 +235,12 @@ > (and (match_code "mem") > (match_test "aarch64_sve_ldr_operand_p (op)"))) > > +(define_memory_constraint "Utf" > + "@internal > + An efficient memory address for a quad-word target operand." > + (and (match_code "mem") > + (match_test "aarch64_movti_target_operand_p (op)"))) > + > (define_memory_constraint "Utv" > "@internal > An address valid for loading/storing opaque structure > diff --git a/gcc/testsuite/gcc.target/aarch64/pr82533.c b/gcc/testsuite/gcc.target/aarch64/pr82533.c > new file mode 100644 > index 00000000000..fa28ffac03a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/pr82533.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mcpu=falkor -O2 -ftree-vectorize" } */ > + > +void > +copy (int N, double *c, double *a) > +{ > + for (int i = 0; i < N; ++i) > + c[i] = a[i]; > +} > + > +/* { dg-final { scan-assembler-not "str\tq\[0-9\]+, \\\[x\[0-9\]+, x\[0-9\]+\\\]" } } */ >
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index cda2895d28e..5a0323deb1e 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -433,6 +433,7 @@ bool aarch64_simd_mem_operand_p (rtx); bool aarch64_sve_ld1r_operand_p (rtx); bool aarch64_sve_ldr_operand_p (rtx); bool aarch64_sve_struct_memory_operand_p (rtx); +bool aarch64_movti_target_operand_p (rtx); rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool); rtx aarch64_tls_get_addr (void); tree aarch64_fold_builtin (tree, int, tree *, bool); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 3d1f6a01cb7..f7daac3e28d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -131,9 +131,9 @@ (define_insn "*aarch64_simd_mov<VQ:mode>" [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, Umq, m, w, ?r, ?w, ?r, w") + "=w, Umq, Utf, w, ?r, ?w, ?r, w") (match_operand:VQ 1 "general_operand" - "m, Dz, w, w, w, r, r, Dn"))] + "m, Dz, w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], <MODE>mode) || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))" diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index ea9ead234cb..04baf5b6de6 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW) are not considered cheap. */ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) +/* Don't use a register offset in a memory address for a quad-word store. */ +AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store", + SLOW_REGOFFSET_QUADWORD_STORE) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 228fd1b908d..c0a05598415 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -875,7 +875,7 @@ static const struct tune_params qdf24xx_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE), /* tune_flags. */ &qdf24xx_prefetch_tune }; @@ -13634,6 +13634,18 @@ aarch64_sve_struct_memory_operand_p (rtx op) && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last)); } +/* Return TRUE if OP uses an efficient memory address for quad-word target. */ +bool +aarch64_movti_target_operand_p (rtx op) +{ + if (! optimize_size + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE)) + return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS + && ! CONST_INT_P (XEXP (XEXP (op, 0), 1))); + return MEM_P (op); +} + /* Emit a register copy from operand to operand, taking care not to early-clobber source registers in the process. diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 5a2a9309a3b..1e6edcf51f2 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1080,9 +1080,9 @@ (define_insn "*movti_aarch64" [(set (match_operand:TI 0 - "nonimmediate_operand" "= r,w, r,w,r,m,m,w,m") + "nonimmediate_operand" "= r,w, r,w,r,m,m,w,Utf") (match_operand:TI 1 - "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m,w"))] + "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m, w"))] "(register_operand (operands[0], TImode) || aarch64_reg_or_zero (operands[1], TImode))" "@ @@ -1227,9 +1227,9 @@ (define_insn "*movtf_aarch64" [(set (match_operand:TF 0 - "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m") + "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Utf,?r,m ,m") (match_operand:TF 1 - "general_operand" " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))] + "general_operand" " w,?r, ?r,w ,Y,Y ,m,w ,m ,?r,Y"))] "TARGET_FLOAT && (register_operand (operands[0], TFmode) || aarch64_reg_or_fp_zero (operands[1], TFmode))" "@ diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index f052103e859..35aa62996ae 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -235,6 +235,12 @@ (and (match_code "mem") (match_test "aarch64_sve_ldr_operand_p (op)"))) +(define_memory_constraint "Utf" + "@internal + An efficient memory address for a quad-word target operand." + (and (match_code "mem") + (match_test "aarch64_movti_target_operand_p (op)"))) + (define_memory_constraint "Utv" "@internal An address valid for loading/storing opaque structure diff --git a/gcc/testsuite/gcc.target/aarch64/pr82533.c b/gcc/testsuite/gcc.target/aarch64/pr82533.c new file mode 100644 index 00000000000..fa28ffac03a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr82533.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-mcpu=falkor -O2 -ftree-vectorize" } */ + +void +copy (int N, double *c, double *a) +{ + for (int i = 0; i < N; ++i) + c[i] = a[i]; +} + +/* { dg-final { scan-assembler-not "str\tq\[0-9\]+, \\\[x\[0-9\]+, x\[0-9\]+\\\]" } } */